├── tests
    ├── __init__.py
    ├── pytest.ini
    ├── neox_args
    │   ├── __init__.py
    │   ├── test_neoxargs_implementation.py
    │   ├── test_neoxargs_usage.py
    │   ├── test_neoxargs_commandline.py
    │   └── test_neoxargs_load.py
    ├── model
    │   ├── __init__.py
    │   ├── test_model_instantiation.py
    │   ├── test_model_generation.py
    │   ├── test_model_checkpoint.py
    │   └── test_model_train.py
    ├── Readme.md
    └── test_configs
    │   └── test_train_base.yml
├── tools
    ├── kill.sh
    ├── killall.sh
    ├── sync_cmd.sh
    ├── sync.sh
    └── syncdir.sh
├── CODEOWNERS
├── megatron
    ├── data
    │   ├── __init__.py
    │   ├── Makefile
    │   ├── blendable_dataset.py
    │   └── samplers.py
    ├── gradient_noise_scale
    │   └── __init__.py
    ├── tokenizer
    │   ├── __init__.py
    │   └── train_tokenizer.py
    ├── model
    │   ├── __init__.py
    │   ├── fused_bias_dropout.py
    │   ├── norms.py
    │   ├── init_functions.py
    │   ├── activations.py
    │   ├── gmlp.py
    │   └── positional_embeddings.py
    ├── fused_kernels
    │   ├── compat.h
    │   ├── __init__.py
    │   ├── setup.py
    │   ├── scaled_upper_triang_masked_softmax.cpp
    │   ├── scaled_masked_softmax.cpp
    │   ├── scaled_upper_triang_masked_softmax_cuda.cu
    │   ├── scaled_masked_softmax_cuda.cu
    │   └── type_shim.h
    ├── mpu
    │   ├── random.py
    │   ├── __init__.py
    │   ├── utils.py
    │   ├── data.py
    │   ├── cross_entropy.py
    │   └── mappings.py
    ├── __init__.py
    ├── neox_arguments
    │   ├── template.py
    │   ├── __init__.py
    │   └── deepspeed_args.py
    └── learning_rates.py
├── requirements
    ├── requirements-sparseattention.txt
    ├── requirements-onebitadam.txt
    ├── requirements-tensorboard.txt
    ├── requirements-dev.txt
    └── requirements.txt
├── MANIFEST.in
├── eval_tasks
    └── __init__.py
├── test.py
├── myscripts
    └── tokenizer_downloads.sh
├── .github
    ├── workflows
    │   ├── pull_request.yml
    │   └── docker_build.yml
    └── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
├── create_tokenizer.py
├── configs
    ├── text_generation.yml
    ├── sparse.yml
    ├── eleutherai_cluster.yml
    ├── myconfigs
    │   ├── data_config.yml
    │   ├── local_setup.yml
    │   ├── small.yml
    │   ├── model_config.yml
    │   └── 20B.yml
    ├── local_setup.yml
    ├── gmlp_small.yml
    ├── small.yml
    ├── 13B.yml
    ├── XL.yml
    ├── 175B.yml
    ├── 2-7B.yml
    ├── 6-7B.yml
    ├── large.yml
    ├── medium.yml
    ├── bnb_small.yml
    ├── small_bf16.yml
    ├── 20B.yml
    └── gen_docs.py
├── .pre-commit-config.yaml
├── train.py
├── deepy.py
├── prepare_data.py
├── CITATION.cff
├── evaluate.py
├── .gitignore
├── generate.py
├── Dockerfile
└── .clang-format


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tools/kill.sh:
--------------------------------------------------------------------------------
1 | pkill -9 python
2 | 


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @EleutherAI/pm-gptneo
2 | 


--------------------------------------------------------------------------------
/megatron/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *
2 | 


--------------------------------------------------------------------------------
/requirements/requirements-sparseattention.txt:
--------------------------------------------------------------------------------
1 | triton==0.4.2
2 | 


--------------------------------------------------------------------------------
/requirements/requirements-onebitadam.txt:
--------------------------------------------------------------------------------
1 | cupy-cuda111==8.6.0
2 | 


--------------------------------------------------------------------------------
/requirements/requirements-tensorboard.txt:
--------------------------------------------------------------------------------
1 | tensorboard==2.5.0
2 | 


--------------------------------------------------------------------------------
/tools/killall.sh:
--------------------------------------------------------------------------------
1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py'
2 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include megatron/data/Makefile
2 | include megatron/data/helpers.cpp
3 | 


--------------------------------------------------------------------------------
/eval_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness
2 | 


--------------------------------------------------------------------------------
/tests/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 |     cpu: marks tests that can be run on cpu
4 | 


--------------------------------------------------------------------------------
/megatron/gradient_noise_scale/__init__.py:
--------------------------------------------------------------------------------
1 | from .gradient_noise_scale import GradientNoiseScale
2 | 


--------------------------------------------------------------------------------
/requirements/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | autopep8==1.5.6
2 | pytest==6.2.3
3 | pytest-cov==2.11.1
4 | pytest-forked==1.3.0
5 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer
2 | 
3 | tokenizer = AutoTokenizer.from_pretrained('downloads/20B_tokenizer.json')


--------------------------------------------------------------------------------
/tests/neox_args/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | testing of implementation of command line arguments and configuration (NeoXArgs)
3 | """
4 | 


--------------------------------------------------------------------------------
/tests/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .test_model_instantiation import run_test_model_instantiation
2 | from .test_model_train import run_train_test
3 | from .test_model_checkpoint import run_checkpoint_test
4 | 


--------------------------------------------------------------------------------
/tools/sync_cmd.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Runs a command in parallel across all nodes
4 | # Usage
5 | # sync_cmd.sh 'echo "hello world"'
6 | 
7 | echo "Command: $1";
8 | pdsh -R ssh -w ^/job/hosts $1
9 | 


--------------------------------------------------------------------------------
/myscripts/tokenizer_downloads.sh:
--------------------------------------------------------------------------------
1 | mkdir data
2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
3 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
4 | mv gpt2-vocab.json data
5 | mv gpt2-merges.txt data


--------------------------------------------------------------------------------
/megatron/data/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
 3 | LIBNAME = helpers
 4 | LIBEXT = $(shell python3-config --extension-suffix)
 5 | 
 6 | default: $(LIBNAME)$(LIBEXT)
 7 | 
 8 | %$(LIBEXT): %.cpp
 9 | 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 | 


--------------------------------------------------------------------------------
/.github/workflows/pull_request.yml:
--------------------------------------------------------------------------------
 1 | name: Pull Request
 2 | 
 3 | on: [pull_request]
 4 | 
 5 | jobs:
 6 |   pre-commit:
 7 |     runs-on: ubuntu-20.04
 8 |     steps:
 9 |       - uses: actions/checkout@v2
10 |       - uses: actions/setup-python@v2
11 |         with:
12 |           python-version: 3.8
13 |       - uses: pre-commit/action@v2.0.3
14 | 


--------------------------------------------------------------------------------
/tools/sync.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Push files to all nodes
 4 | # Usage
 5 | # sync.sh file [file2..]
 6 | 
 7 | echo Number of files to upload: $#
 8 | 
 9 | for file in "$@"
10 | do
11 |     full_path=$(realpath $file)
12 |     echo Uploading $full_path
13 |     pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path
14 | done
15 | 


--------------------------------------------------------------------------------
/tools/syncdir.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Push files to all nodes
 4 | # Usage
 5 | # sync.sh file [file2..]
 6 | 
 7 | echo Number of files to upload: $#
 8 | 
 9 | for file in "$@"
10 | do
11 |     full_path=$(realpath $file)
12 |     parentdir="$(dirname "$full_path")"
13 |     echo Uploading $full_path to $parentdir
14 |     pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir
15 | done
16 | 


--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_implementation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | check implementation of NeoXArgs for duplication errors (would overwrite)
 3 | """
 4 | import pytest
 5 | 
 6 | 
 7 | @pytest.mark.cpu
 8 | def test_neoxargs_duplicates():
 9 |     """
10 |     tests that there are no duplicates among parent classes of NeoXArgs
11 |     """
12 |     from megatron import NeoXArgs
13 | 
14 |     assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates"
15 | 


--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
 1 | git+git://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed
 2 | einops==0.3.0
 3 | ftfy==6.0.1
 4 | lm_dataformat==0.0.19
 5 | git+https://github.com/EleutherAI/lm-evaluation-harness.git@dc937d4b70af819c5695e09d94e59e4cdb1e40ad#egg=lm_eval
 6 | mpi4py==3.0.3
 7 | numpy==1.21.0
 8 | pybind11==2.6.2
 9 | regex
10 | sentencepiece
11 | six
12 | tokenizers==0.10.2
13 | transformers==4.5.0
14 | wandb==0.10.28
15 | 


--------------------------------------------------------------------------------
/create_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from tokenizers import Tokenizer
 2 | from megatron.tokenizer.tokenizer import HFTokenizer
 3 | 
 4 | 
 5 | filepath = 'data/tokenizer/20B_tokenizer.json'
 6 | tokenizer = HFTokenizer(filepath)
 7 | hello_ids = tokenizer.tokenize("hello")
 8 | print(hello_ids)
 9 | 
10 | ids = tokenizer.tokenize('}{')
11 | print(ids)
12 | ids = tokenizer.tokenize('---')
13 | print(ids)
14 | start_ids = tokenizer.tokenize('<|startoftext|>')
15 | end_ids = tokenizer.tokenize('<|endoftext|>')
16 | print(start_ids)
17 | print(end_ids)


--------------------------------------------------------------------------------
/configs/text_generation.yml:
--------------------------------------------------------------------------------
 1 | # Parameters used for text generation
 2 | # Make sure `load` is specified somewhere else
 3 | {
 4 |   # Text gen type: `input-file`, `unconditional` or `interactive`
 5 |   "text-gen-type": "unconditional",
 6 | 
 7 |   # Params for all
 8 |   "maximum_tokens": 102,
 9 |   "temperature": 1.0,
10 |   "top_p": .9,
11 |   "top_k": 50,
12 |   "recompute": false,
13 | 
14 |   # `unconditional`: samples
15 |   "num-samples": 5,
16 | 
17 |   # input/output file
18 |   "sample-input-file": "sample_input.txt",
19 |   "sample-output-file": "sample_output.txt",
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/sparse.yml:
--------------------------------------------------------------------------------
 1 | # Add this to your config for sparse attention every other layer
 2 | {
 3 |   "attention_config": [[["local", "global"], "all"]],
 4 | 
 5 |   # sparsity config:
 6 |   # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for
 7 |   # illustrative purposes)
 8 |   # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for
 9 |   # more detailed config instructions and available parameters
10 | 
11 |   "sparsity_config": {
12 |     "block": 16, # block size
13 |     "num_local_blocks": 32,
14 |   }
15 | }
16 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: feature request
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/megatron/tokenizer/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from .tokenizer import build_tokenizer
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Proposed solution**
24 | If you have an idea for how we can fix this problem, describe it here.
25 | 
26 | **Screenshots**
27 | If applicable, add screenshots to help explain your problem.
28 | 
29 | **Environment (please complete the following information):**
30 |  - GPUs:
31 | - Configs:
32 | 
33 | **Additional context**
34 | Add any other context about the problem here.
35 | 


--------------------------------------------------------------------------------
/megatron/model/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
 3 | #
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | from .gpt2_model import GPT2ModelPipe
19 | from .utils import get_params_for_weight_decay_optimization
20 | from .word_embeddings import SoftEmbedding
21 | 


--------------------------------------------------------------------------------
/configs/eleutherai_cluster.yml:
--------------------------------------------------------------------------------
 1 | # Data paths and options when using EleutherAI cluster
 2 | {
 3 |   "data-path": "/mnt/ssd-1/data/enron/enron_text_document",
 4 |   # or for weighted datasets:
 5 |   # "train-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
 6 |   # "test-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
 7 |   # "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
 8 |   # "train-data-weights": [1., 2.],
 9 |   # "test-data-weights": [2., 1.],
10 |   # "valid-data-weights": [0.5, 0.4],
11 | 
12 |   "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json",
13 |   "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt",
14 |   "save": "/mnt/ssd-1/checkpoints",
15 |   "load": "/mnt/ssd-1/checkpoints",
16 |   "tensorboard-dir": "/mnt/ssd-1/tensorboard",
17 |   "log-dir": "/mnt/ssd-1/logs",
18 |   "wandb_team": "eleutherai",
19 | }
20 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /*This code is copied fron NVIDIA apex:
18 |  *     https://github.com/NVIDIA/apex
19 |  *     with minor changes. */
20 | 
21 | #ifndef TORCH_CHECK
22 | #define TORCH_CHECK AT_CHECK
23 | #endif
24 | 
25 | #ifdef VERSION_GE_1_3
26 | #define DATA_PTR data_ptr
27 | #else
28 | #define DATA_PTR data
29 | #endif
30 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |     - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |       rev: v4.1.0
 4 |       hooks:
 5 |           - id: check-case-conflict
 6 |           - id: check-json
 7 |           - id: check-symlinks
 8 |           - id: check-yaml
 9 |           - id: destroyed-symlinks
10 |           - id: end-of-file-fixer
11 |             exclude: docs/CNAME
12 |           - id: fix-byte-order-marker
13 |           - id: fix-encoding-pragma
14 |             args: [--remove]
15 |           - id: mixed-line-ending
16 |             args: [--fix=lf]
17 |           - id: requirements-txt-fixer
18 |           - id: trailing-whitespace
19 |     - repo: https://gitlab.com/daverona/pre-commit-cpp
20 |       rev: 0.8.0
21 |       hooks:
22 |           - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
23 |             args: []
24 | 
25 |     - repo: https://github.com/psf/black
26 |       rev: 21.8b0
27 |       hooks:
28 |           - id: black
29 |             language_version: python3.8
30 | 


--------------------------------------------------------------------------------
/megatron/mpu/random.py:
--------------------------------------------------------------------------------
 1 | # mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports
 2 | # TODO: should be able to get rid of this file entirely
 3 | 
 4 | import deepspeed
 5 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing
 6 | 
 7 | # Default name for the model parallel rng tracker.
 8 | _MODEL_PARALLEL_RNG_TRACKER_NAME = (
 9 |     deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME
10 | )
11 | 
12 | # Whether apply model parallelsim to checkpointed hidden states.
13 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
14 | 
15 | # RNG tracker object.
16 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER
17 | 
18 | # Deepspeed checkpointing functions
19 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones
20 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state
21 | checkpoint = checkpointing.checkpoint
22 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed
23 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker
24 | 


--------------------------------------------------------------------------------
/megatron/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 | 
16 | 
17 | def print_rank_0(*message):
18 |     """If distributed is initialized print only on rank 0."""
19 |     if torch.distributed.is_initialized():
20 |         if torch.distributed.get_rank() == 0:
21 |             print(*message, flush=True)
22 |     else:
23 |         print(*message, flush=True)
24 | 
25 | 
26 | from .initialize import initialize_megatron
27 | from .neox_arguments import NeoXArgs
28 | 


--------------------------------------------------------------------------------
/megatron/neox_arguments/template.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | import logging
 3 | 
 4 | 
 5 | @dataclass
 6 | class NeoXArgsTemplate:
 7 |     def defaults(self):
 8 |         """
 9 |         generator for getting default values.
10 |         """
11 |         for key, field_def in self.__dataclass_fields__.items():
12 |             yield key, field_def.default
13 | 
14 |     def update_value(self, key: str, value):
15 |         """
16 |         updates a property value if the key already exists
17 | 
18 |         Problem: a previously non-existing property can be added to the class instance without error.
19 |         """
20 |         if hasattr(self, key):
21 |             setattr(self, key, value)
22 |         else:
23 |             error_message = (
24 |                 self.__class__.__name__
25 |                 + ".update_value() to be updated property "
26 |                 + str(key)
27 |                 + " does not exist"
28 |             )
29 |             logging.error(error_message)
30 |             raise ValueError(error_message)
31 | 
32 |     def update_values(self, d):
33 |         """
34 |         Updates multiple values in self if the keys already exists
35 |         """
36 |         for k, v in d.items():
37 |             self.update_value(k, v)
38 | 


--------------------------------------------------------------------------------
/configs/myconfigs/data_config.yml:
--------------------------------------------------------------------------------
 1 | # Suggested data paths when using GPT-NeoX locally
 2 | {
 3 |   "data-path": "data/train_data/passage_crits_text",
 4 | 
 5 |   # or for weighted datasets:
 6 |   # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 7 |   # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 8 |   # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 9 |   # "train-data-weights": [1., 2.],
10 |   # "test-data-weights": [2., 1.],
11 |   # "valid-data-weights": [0.5, 0.4],
12 | 
13 |   # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.
14 |   # WARNING: setting this to True will override any user provided weights
15 |   # "weight_by_num_documents": false,
16 |   # "weighted_sampler_alpha": 0.3,
17 | 
18 |   "vocab-file": "data/tokenizer/20B_tokenizer.json",
19 | 
20 |   "save": "checkpoints",
21 |   "load": "checkpoints",
22 |   "checkpoint_validation_with_forward_pass": False,
23 | 
24 |   "tensorboard-dir": "tensorboard",
25 |   "log-dir": "logs",
26 |   "use_wandb": True,
27 |   "wandb_host": "https://api.wandb.ai",
28 |   "wandb_project": "neox"
29 | }
30 | 


--------------------------------------------------------------------------------
/configs/local_setup.yml:
--------------------------------------------------------------------------------
 1 | # Suggested data paths when using GPT-NeoX locally
 2 | {
 3 |   "data-path": "data/enron/enron_text_document",
 4 | 
 5 |   # or for weighted datasets:
 6 |   # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 7 |   # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 8 |   # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 9 |   # "train-data-weights": [1., 2.],
10 |   # "test-data-weights": [2., 1.],
11 |   # "valid-data-weights": [0.5, 0.4],
12 | 
13 |   # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.
14 |   # WARNING: setting this to True will override any user provided weights
15 |   # "weight_by_num_documents": false,
16 |   # "weighted_sampler_alpha": 0.3,
17 | 
18 |   "vocab-file": "data/gpt2-vocab.json",
19 |   "merge-file": "data/gpt2-merges.txt",
20 | 
21 |   "save": "checkpoints",
22 |   "load": "checkpoints",
23 |   "checkpoint_validation_with_forward_pass": False,
24 | 
25 |   "tensorboard-dir": "tensorboard",
26 |   "log-dir": "logs",
27 |   "use_wandb": True,
28 |   "wandb_host": "https://api.wandb.ai",
29 |   "wandb_project": "neox"
30 | }
31 | 


--------------------------------------------------------------------------------
/configs/myconfigs/local_setup.yml:
--------------------------------------------------------------------------------
 1 | # Suggested data paths when using GPT-NeoX locally
 2 | {
 3 |   "data-path": "data/enron/enron_text_document",
 4 | 
 5 |   # or for weighted datasets:
 6 |   # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 7 |   # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 8 |   # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 9 |   # "train-data-weights": [1., 2.],
10 |   # "test-data-weights": [2., 1.],
11 |   # "valid-data-weights": [0.5, 0.4],
12 | 
13 |   # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.
14 |   # WARNING: setting this to True will override any user provided weights
15 |   # "weight_by_num_documents": false,
16 |   # "weighted_sampler_alpha": 0.3,
17 | 
18 |   "vocab-file": "data/gpt2-vocab.json",
19 |   "merge-file": "data/gpt2-merges.txt",
20 | 
21 |   "save": "checkpoints",
22 |   "load": "checkpoints",
23 |   "checkpoint_validation_with_forward_pass": False,
24 | 
25 |   "tensorboard-dir": "tensorboard",
26 |   "log-dir": "logs",
27 |   "use_wandb": True,
28 |   "wandb_host": "https://api.wandb.ai",
29 |   "wandb_project": "neox"
30 | }
31 | 


--------------------------------------------------------------------------------
/tests/Readme.md:
--------------------------------------------------------------------------------
 1 | # Dependencies
 2 | 
 3 | Tests use pytests with coverage and forked plugins. Install with:
 4 | 
 5 | ```bash
 6 | pip install -r requirements/requirements-dev.txt
 7 | ```
 8 | 
 9 | # Run
10 | 
11 | Tests can be run using pytest.
12 | 
13 | * The argument --forked needs to be provided
14 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation)
15 | * A subset of tests can be selected by pointing to the module within tests
16 | 
17 | ```bash
18 | # run all tests, output coverage report of megatron module in terminal
19 | pytest --forked --cov-report term --cov=megatron tests
20 | 
21 | # run tests in tests/model, output coverage report of megatron module as html
22 | pytest --forked --cov-report html --cov=megatron tests/model
23 | 
24 | # run tests in tests/model/test_model_generation.py, don't output coverage report
25 | pytest --forked tests/model/test_model_generation.py
26 | ```
27 | 
28 | Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu.
29 | The test cases for cpu can be run with:
30 | ````
31 | pytest tests -m cpu
32 | ```
33 | 
34 | If a html coverage report has been created a simple http server can be run to serve static files.
35 | 
36 | ```bash
37 | python -m http.server --directory htmlcov 8000
38 | ```
39 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, EleutherAI contributors
 2 | # This file is based on code by the authors denoted below and has been modified from its original version.
 3 | #
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | """Pretrain"""
19 | from megatron.neox_arguments import NeoXArgs
20 | from megatron.training import pretrain
21 | 
22 | if __name__ == "__main__":
23 |     neox_args = NeoXArgs.consume_neox_args()
24 |     neox_args.configure_distributed_args()
25 |     neox_args.build_tokenizer()  # tokenizer needs to be build in training in order to set the padding vocab
26 |     neox_args.initialize_tensorboard_writer()  # is initialized if tensorboard directory is defined
27 |     pretrain(neox_args=neox_args)
28 | 


--------------------------------------------------------------------------------
/megatron/model/fused_bias_dropout.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from typing import Optional
 4 | from torch import Tensor
 5 | 
 6 | # flags required to enable jit fusion kernels
 7 | torch._C._jit_set_profiling_mode(False)
 8 | torch._C._jit_set_profiling_executor(False)
 9 | torch._C._jit_override_can_fuse_on_cpu(True)
10 | torch._C._jit_override_can_fuse_on_gpu(True)
11 | 
12 | 
13 | def bias_dropout_add(
14 |     x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool
15 | ) -> Tensor:
16 |     out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
17 |     if residual is not None:
18 |         out = residual + out
19 |     return out
20 | 
21 | 
22 | def get_bias_dropout_add(training):
23 |     def _bias_dropout_add(x, bias, residual, prob):
24 |         return bias_dropout_add(x, bias, residual, prob, training)
25 | 
26 |     return _bias_dropout_add
27 | 
28 | 
29 | @torch.jit.script
30 | def bias_dropout_add_fused_train(
31 |     x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float
32 | ) -> Tensor:
33 |     return bias_dropout_add(x, bias, residual, prob, True)
34 | 
35 | 
36 | @torch.jit.script
37 | def bias_dropout_add_fused_inference(
38 |     x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float
39 | ) -> Tensor:
40 |     return bias_dropout_add(x, bias, residual, prob, False)
41 | 


--------------------------------------------------------------------------------
/deepy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) 2021, EleutherAI contributors
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import logging
17 | import os
18 | 
19 | import deepspeed
20 | from deepspeed.launcher.runner import main
21 | 
22 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
23 | 
24 | from megatron.neox_arguments import NeoXArgs
25 | from megatron.utils import get_wandb_api_key
26 | 
27 | 
28 | neox_args = NeoXArgs.consume_deepy_args()
29 | deepspeed_main_args = neox_args.get_deepspeed_main_args()
30 | 
31 | # Extract wandb API key and inject into worker environments
32 | wandb_token = get_wandb_api_key(neox_args=neox_args)
33 | if wandb_token is not None:
34 |     deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
35 |     os.environ["WANDB_API_KEY"] = wandb_token
36 | 
37 | if __name__ == "__main__":
38 |     main(deepspeed_main_args)
39 | 


--------------------------------------------------------------------------------
/.github/workflows/docker_build.yml:
--------------------------------------------------------------------------------
 1 | name: docker_build
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - '**'
 7 | 
 8 | jobs:
 9 |   main:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       -
13 |         name: Checkout
14 |         uses: actions/checkout@v2
15 | 
16 |       -
17 |         name: Docker meta
18 |         id: docker_meta
19 |         uses: crazy-max/ghaction-docker-meta@v1
20 |         with:
21 |           images: leogao2/gpt-neox # list of Docker images to use as base name for tags
22 |           tag-sha: true # add git short SHA as Docker tag
23 | 
24 |       -
25 |         name: Set up QEMU
26 |         uses: docker/setup-qemu-action@v1
27 | 
28 |       -
29 |         name: Set up Docker Buildx
30 |         uses: docker/setup-buildx-action@v1
31 | 
32 |       -
33 |         name: Login to DockerHub
34 |         uses: docker/login-action@v1
35 |         with:
36 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
37 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
38 | 
39 |       -
40 |         name: Build and push
41 |         id: docker_build
42 |         uses: docker/build-push-action@v2
43 |         with:
44 |           push: ${{ github.event_name != 'pull_request' }}
45 |           tags: ${{ steps.docker_meta.outputs.tags }}
46 |           labels: ${{ steps.docker_meta.outputs.labels }}
47 | 
48 |       -
49 |         name: Image digest
50 |         run: echo ${{ steps.docker_build.outputs.digest }}
51 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | import pathlib
17 | import subprocess
18 | 
19 | from torch.utils import cpp_extension
20 | from pathlib import Path
21 | 
22 | srcpath = Path(__file__).parent.absolute()
23 | 
24 | # Setting this param to a list has a problem of generating different
25 | # compilation commands (with diferent order of architectures) and
26 | # leading to recompilation of fused kernels. Set it to empty string
27 | # to avoid recompilation and assign arch flags explicity in
28 | # extra_cuda_cflags below
29 | os.environ["TORCH_CUDA_ARCH_LIST"] = ""
30 | 
31 | def load_fused_kernels():
32 |     try:
33 |         import scaled_upper_triang_masked_softmax_cuda
34 |         import scaled_masked_softmax_cuda
35 |     except (ImportError, ModuleNotFoundError):
36 |         print("\n")
37 |         print("=" * 100)
38 |         print(f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them')
39 |         print("=" * 100)
40 |         exit()
41 |     return
42 | 


--------------------------------------------------------------------------------
/prepare_data.py:
--------------------------------------------------------------------------------
 1 | from tools.corpora import prepare_dataset, DATA_DOWNLOADERS
 2 | import argparse
 3 | 
 4 | TOKENIZER_CHOICES = [
 5 |     "HFGPT2Tokenizer",
 6 |     "HFTokenizer",
 7 |     "GPT2BPETokenizer",
 8 |     "CharLevelTokenizer",
 9 | ]
10 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"]
11 | 
12 | 
13 | def get_args():
14 |     parser = argparse.ArgumentParser(description="Download & preprocess neox datasets")
15 |     parser.add_argument(
16 |         "dataset",
17 |         nargs="?",
18 |         default="enron",
19 |         help="name of dataset to download.",
20 |         choices=DATASET_CHOICES,
21 |     )
22 |     parser.add_argument(
23 |         "-t",
24 |         "--tokenizer",
25 |         default="GPT2BPETokenizer",
26 |         choices=TOKENIZER_CHOICES,
27 |         help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}',
28 |     )
29 |     parser.add_argument(
30 |         "-d",
31 |         "--data-dir",
32 |         default=None,
33 |         help=f"Directory to which to download datasets / tokenizer "
34 |         f"files - defaults to ./data",
35 |     )
36 |     parser.add_argument(
37 |         "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)"
38 |     )
39 |     parser.add_argument(
40 |         "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)"
41 |     )
42 |     return parser.parse_args()
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     args = get_args()
47 |     prepare_dataset(
48 |         dataset_name=args.dataset,
49 |         tokenizer_type=args.tokenizer,
50 |         data_dir=args.data_dir,
51 |         vocab_file=args.vocab_file,
52 |         merge_file=args.merge_file,
53 |     )
54 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | # YAML 1.2
 2 | ---
 3 | authors:
 4 |   - affiliation: EleutherAI
 5 |     family-names: Andonian
 6 |     given-names: Alex
 7 |   - affiliation: EleutherAI
 8 |     family-names: Biderman
 9 |     given-names: Stella
10 |   - affiliation: EleutherAI
11 |     family-names: Black
12 |     given-names: Sid
13 |   - affiliation: EleutherAI
14 |     family-names: Gali
15 |     given-names: Preetham
16 |   - affiliation: EleutherAI
17 |     family-names: Gao
18 |     given-names: Leo
19 |   - affiliation: EleutherAI
20 |     family-names: Hallahan
21 |     given-names: Eric
22 |   - affiliation: EleutherAI
23 |     family-names: Levy-Kramer
24 |     given-names: Josh
25 |   - affiliation: EleutherAI
26 |     family-names: Leahy
27 |     given-names: Connor
28 |   - affiliation: EleutherAI
29 |     family-names: Nestler
30 |     given-names: Lucas
31 |   - affiliation: EleutherAI
32 |     family-names: Parker
33 |     given-names: Kip
34 |   - affiliation: EleutherAI
35 |     family-names: Pieler
36 |     given-names: Michael
37 |   - affiliation: EleutherAI
38 |     family-names: Purohit
39 |     given-names: Shivanshu
40 |   - affiliation: EleutherAI
41 |     family-names: Songz
42 |     given-names: Tri
43 |   - affiliation: EleutherAI
44 |     family-names: Phil
45 |     given-names: Wang
46 |   - affiliation: EleutherAI
47 |     family-names: Weinbach
48 |     given-names: Samuel
49 | cff-version: "1.1.0"
50 | keywords:
51 |   - Transformers
52 |   - "Massive language model"
53 |   - "Autoregressive language model"
54 | license: "Apache-2.0"
55 | message: "If you use this software, please cite it using these metadata."
56 | repository-code: "https://www.github.com/eleutherai/gpt-neox"
57 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch"
58 | version: "0.0.1"
59 | date-released: 2021-08-23
60 | ...
61 | 


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, EleutherAI contributors
 2 | # This file is based on code by the authors denoted below and has been modified from its original version.
 3 | #
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | """Evaluation tasks - modified from https://github.com/EleutherAI/lm-evaluation-harness"""
19 | 
20 | import os
21 | import sys
22 | 
23 | sys.path.append(
24 |     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
25 | )
26 | from megatron.training import forward_step
27 | from megatron.utils import setup_for_inference_or_eval
28 | from eval_tasks import run_eval_harness
29 | from pprint import pprint
30 | from datetime import datetime
31 | import json
32 | 
33 | 
34 | def main():
35 |     model, neox_args = setup_for_inference_or_eval(use_cache=False)
36 |     results = run_eval_harness(
37 |         model,
38 |         forward_step,
39 |         neox_args,
40 |         eval_tasks=neox_args.eval_tasks,
41 |         bootstrap_iters=10000,
42 |     )
43 |     if neox_args.rank == 0:
44 |         pprint(results)
45 |         results_path = (
46 |             f'eval_results_{datetime.now().strftime("%m-%d-%Y-%H-%M-%S")}.json'
47 |         )
48 |         if neox_args.eval_results_prefix:
49 |             results_path = f"{neox_args.eval_results_prefix}_{results_path}"
50 |         with open(results_path, "w") as f:
51 |             json.dump(results, f, indent=4)
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 


--------------------------------------------------------------------------------
/configs/gmlp_small.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 |    "attention_config": [[["gmlp"], "all"]],
 8 | 
 9 | 
10 |    # model settings
11 |    "num-layers": 12,
12 |    "hidden-size": 768, # gmlp d_ff defaults to hidden_size * 4
13 |    "gmlp_attn_dim": 64,
14 |    "num-attention-heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention.
15 |    "seq-length": 2048,
16 |    "max-position-embeddings": 2048,
17 |    "norm": "layernorm",
18 |    "pos-emb": "none",
19 |    "no-weight-tying": true,
20 | 
21 |    # optimizer settings
22 |    "optimizer": {
23 |      "type": "Adam",
24 |      "params": {
25 |        "lr": 0.0006,
26 |        "betas": [0.9, 0.999],
27 |        "eps": 1.0e-8,
28 |      }
29 |    },
30 | 
31 |    # batch / data settings
32 |    "train_micro_batch_size_per_gpu": 4,
33 |    "data-impl": "mmap",
34 |    "split": "949,50,1",
35 | 
36 |    # activation checkpointing
37 |    "checkpoint-activations": true,
38 |    "checkpoint-num-layers": 1,
39 |    "partition-activations": false,
40 |    "synchronize-each-layer": true,
41 | 
42 |    # regularization
43 |    "gradient_clipping": 1.0,
44 |    "weight-decay": 0.1,
45 |    "hidden-dropout": 0.0,
46 |    "attention-dropout": 0.0,
47 | 
48 |    # precision settings
49 |    "fp16": {
50 |      "enabled": true,
51 |      "loss_scale": 0,
52 |      "loss_scale_window": 1000,
53 |      "hysteresis": 2,
54 |      "min_loss_scale": 1
55 |    },
56 | 
57 |    # misc. training settings
58 |    "train-iters": 320000,
59 |    "lr-decay-iters": 320000,
60 |    "distributed-backend": "nccl",
61 |    "lr-decay-style": "cosine",
62 |    "warmup": 0.01,
63 |    "save-interval": 10000,
64 |    "eval-interval": 1000,
65 |    "eval-iters": 10,
66 | 
67 |    # logging
68 |    "log-interval": 100,
69 |    "steps_per_print": 10,
70 |    "keep-last-n-checkpoints": 4,
71 |    "wall_clock_breakdown": true,
72 | }
73 | 


--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_usage.py:
--------------------------------------------------------------------------------
 1 | """
 2 | plausibility check for the usage of neox_args in the megatron codebase
 3 | """
 4 | import pytest
 5 | import re
 6 | from ..common import get_root_directory
 7 | 
 8 | 
 9 | @pytest.mark.cpu
10 | def test_neoxargs_usage():
11 |     """ "
12 |     checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs
13 |     """
14 |     from megatron.neox_arguments import NeoXArgs
15 | 
16 |     declared_all = True
17 |     neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys())
18 | 
19 |     # we exlude a number of properties (implemented with the @property decorator) or functions that we know exists
20 |     exclude = set(
21 |         [
22 |             "params_dtype",
23 |             "deepspeed_config",
24 |             "get",
25 |             "pop",
26 |             "get_deepspeed_main_args",
27 |             'optimizer["params"]',
28 |             "attention_config[layer_number]",
29 |             "adlr_autoresume_object",
30 |             "update_value",
31 |             "all_config",
32 |             "tensorboard_writer",
33 |             "tokenizer",
34 |             "train_batch_size]",
35 |         ]
36 |     )
37 | 
38 |     # test file by file
39 |     for filename in (get_root_directory() / "megatron").glob("**/*.py"):
40 |         if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]:
41 |             continue
42 | 
43 |         # load file
44 |         with open(filename, "r") as f:
45 |             file_contents = f.read()
46 | 
47 |         # find args matches
48 |         matches = list(
49 |             re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=])", file_contents)
50 |         )
51 |         if len(matches) == 0:
52 |             continue
53 | 
54 |         # compare
55 |         for match in matches:
56 |             if match not in neox_args_attributes and match not in exclude:
57 |                 print(
58 |                     f"(arguments used not found in neox args): {filename.name}: {match}",
59 |                     flush=True,
60 |                 )
61 |                 declared_all = False
62 | 
63 |     assert declared_all, "all arguments used in code defined in NeoXArgs"
64 | 


--------------------------------------------------------------------------------
/configs/small.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 12,
10 |    "hidden-size": 768,
11 |    "num-attention-heads": 12,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.0006,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 0,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0.0,
57 |    "hidden-dropout": 0.0,
58 |    "attention-dropout": 0.0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/megatron/mpu/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Model parallel utility interface."""
16 | 
17 | from .cross_entropy import vocab_parallel_cross_entropy
18 | 
19 | from .data import broadcast_data
20 | 
21 | from .initialize import is_unitialized
22 | from .initialize import destroy_model_parallel
23 | from .initialize import get_data_parallel_group
24 | from .initialize import get_data_parallel_rank
25 | from .initialize import get_data_parallel_world_size
26 | from .initialize import get_model_parallel_group
27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank
28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank
29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size
30 | from .initialize import get_topology
31 | from .initialize import get_pipe_parallel_group
32 | from .initialize import get_pipe_parallel_rank
33 | from .initialize import get_pipe_parallel_world_size
34 | from .initialize import get_io_parallel_group
35 | from .initialize import initialize_model_parallel
36 | from .initialize import model_parallel_is_initialized
37 | 
38 | from .layers import ColumnParallelLinear
39 | from .layers import RowParallelLinear
40 | from .layers import VocabParallelEmbedding
41 | from .layers import ParallelRelativePositionBias
42 | 
43 | from .mappings import copy_to_model_parallel_region
44 | from .mappings import gather_from_model_parallel_region
45 | from .mappings import reduce_from_model_parallel_region
46 | from .mappings import scatter_to_model_parallel_region
47 | 
48 | from .random import checkpoint
49 | from .random import get_cuda_rng_tracker
50 | from .random import model_parallel_cuda_manual_seed
51 | 
52 | from .utils import divide
53 | from .utils import split_tensor_along_last_dim
54 | 


--------------------------------------------------------------------------------
/configs/13B.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 40,
10 |    "hidden-size": 5120,
11 |    "num-attention-heads": 40,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 |    # optimizer settings
23 |    "optimizer": {
24 |      "type": "Adam",
25 |      "params": {
26 |        "lr": 0.0001,
27 |        "betas": [0.9, 0.999],
28 |        "eps": 1.0e-8,
29 |      }
30 |    },
31 |    "zero_optimization": {
32 |     "stage": 1,
33 |     "allgather_partitions": True,
34 |     "allgather_bucket_size": 500000000,
35 |     "overlap_comm": True,
36 |     "reduce_scatter": True,
37 |     "reduce_bucket_size": 500000000,
38 |     "contiguous_gradients": True,
39 |     "cpu_offload": False
40 |   },
41 | 
42 |    # batch / data settings
43 |    "train_micro_batch_size_per_gpu": 4,
44 |    "data-impl": "mmap",
45 |    "split": "949,50,1",
46 | 
47 |    # activation checkpointing
48 |    "checkpoint-activations": true,
49 |    "checkpoint-num-layers": 1,
50 |    "partition-activations": true,
51 |    "synchronize-each-layer": true,
52 | 
53 |    # regularization
54 |    "gradient_clipping": 1.0,
55 |    "weight-decay": 0,
56 |    "hidden-dropout": 0,
57 |    "attention-dropout": 0,
58 | 
59 |    # precision settings
60 |    "fp16": {
61 |      "fp16": true,
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/configs/XL.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 24,
10 |    "hidden-size": 2048,
11 |    "num-attention-heads": 16,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 |    # optimizer settings
23 |    "optimizer": {
24 |      "type": "Adam",
25 |      "params": {
26 |        "lr": 0.0002,
27 |        "betas": [0.9, 0.999],
28 |        "eps":  1.0e-8,
29 |      }
30 |    },
31 |    "zero_optimization": {
32 |     "stage": 1,
33 |     "allgather_partitions": True,
34 |     "allgather_bucket_size": 500000000,
35 |     "overlap_comm": True,
36 |     "reduce_scatter": True,
37 |     "reduce_bucket_size": 500000000,
38 |     "contiguous_gradients": True,
39 |     "cpu_offload": False
40 |   },
41 | 
42 |    # batch / data settings
43 |    "train_micro_batch_size_per_gpu": 4,
44 |    "data-impl": "mmap",
45 |    "split": "949,50,1",
46 | 
47 |    # activation checkpointing
48 |    "checkpoint-activations": true,
49 |    "checkpoint-num-layers": 1,
50 |    "partition-activations": true,
51 |    "synchronize-each-layer": true,
52 | 
53 |    # regularization
54 |    "gradient_clipping": 1.0,
55 |    "weight-decay": 0,
56 |    "hidden-dropout": 0,
57 |    "attention-dropout": 0,
58 | 
59 |    # precision settings
60 |    "fp16": {
61 |      "fp16": true,
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/configs/myconfigs/small.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 12,
10 |    "hidden-size": 768,
11 |    "num-attention-heads": 12,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.0006,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 0,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0.0,
57 |    "hidden-dropout": 0.0,
58 |    "attention-dropout": 0.0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/configs/175B.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 96,
10 |    "hidden-size": 12288,
11 |    "num-attention-heads": 96,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 |    # optimizer settings
23 |    "optimizer": {
24 |      "type": "Adam",
25 |      "params": {
26 |        "lr": 0.00006,
27 |        "betas": [0.9, 0.999],
28 |        "eps": 1.0e-8,
29 |      }
30 |    },
31 |    "zero_optimization": {
32 |     "stage": 1,
33 |     "allgather_partitions": True,
34 |     "allgather_bucket_size": 500000000,
35 |     "overlap_comm": True,
36 |     "reduce_scatter": True,
37 |     "reduce_bucket_size": 500000000,
38 |     "contiguous_gradients": True,
39 |     "cpu_offload": False
40 |   },
41 | 
42 |    # batch / data settings
43 |    "train_micro_batch_size_per_gpu": 4,
44 |    "data-impl": "mmap",
45 |    "split": "949,50,1",
46 | 
47 |    # activation checkpointing
48 |    "checkpoint-activations": true,
49 |    "checkpoint-num-layers": 1,
50 |    "partition-activations": true,
51 |    "synchronize-each-layer": true,
52 | 
53 |    # regularization
54 |    "gradient_clipping": 1.0,
55 |    "weight-decay": 0,
56 |    "hidden-dropout": 0,
57 |    "attention-dropout": 0,
58 | 
59 |    # precision settings
60 |    "fp16": {
61 |      "fp16": true,
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/configs/2-7B.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 32,
10 |    "hidden-size": 2560,
11 |    "num-attention-heads": 32,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.00016,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 1,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0,
57 |    "hidden-dropout": 0,
58 |    "attention-dropout": 0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "fp16": true,
63 |      "enabled": true,
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    # misc. training settings
71 |    "train-iters": 320000,
72 |    "lr-decay-iters": 320000,
73 |    "distributed-backend": "nccl",
74 |    "lr-decay-style": "cosine",
75 |    "warmup": 0.01,
76 |    "save-interval": 10000,
77 |    "eval-interval": 1000,
78 |    "eval-iters": 10,
79 | 
80 |    # logging
81 |    "log-interval": 100,
82 |    "steps_per_print": 10,
83 |    "keep-last-n-checkpoints": 4,
84 |    "wall_clock_breakdown": true,
85 | }
86 | 


--------------------------------------------------------------------------------
/configs/6-7B.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 32,
10 |    "hidden-size": 4096,
11 |    "num-attention-heads": 32,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.00012,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 1,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0,
57 |    "hidden-dropout": 0,
58 |    "attention-dropout": 0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "fp16": true,
63 |      "enabled": true,
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    # misc. training settings
71 |    "train-iters": 320000,
72 |    "lr-decay-iters": 320000,
73 |    "distributed-backend": "nccl",
74 |    "lr-decay-style": "cosine",
75 |    "warmup": 0.01,
76 |    "save-interval": 10000,
77 |    "eval-interval": 1000,
78 |    "eval-iters": 10,
79 | 
80 |    # logging
81 |    "log-interval": 100,
82 |    "steps_per_print": 10,
83 |    "keep-last-n-checkpoints": 4,
84 |    "wall_clock_breakdown": true,
85 | }
86 | 


--------------------------------------------------------------------------------
/configs/large.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 24,
10 |    "hidden-size": 1536,
11 |    "num-attention-heads": 16,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.00025,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 1,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0,
57 |    "hidden-dropout": 0,
58 |    "attention-dropout": 0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "fp16": true,
63 |      "enabled": true,
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    # misc. training settings
71 |    "train-iters": 320000,
72 |    "lr-decay-iters": 320000,
73 |    "distributed-backend": "nccl",
74 |    "lr-decay-style": "cosine",
75 |    "warmup": 0.01,
76 |    "save-interval": 10000,
77 |    "eval-interval": 1000,
78 |    "eval-iters": 10,
79 | 
80 |    # logging
81 |    "log-interval": 100,
82 |    "steps_per_print": 10,
83 |    "keep-last-n-checkpoints": 4,
84 |    "wall_clock_breakdown": true,
85 | }
86 | 


--------------------------------------------------------------------------------
/configs/medium.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 24,
10 |    "hidden-size": 1024,
11 |    "num-attention-heads": 16,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 | 
24 |    # optimizer settings
25 |    "optimizer": {
26 |      "type": "Adam",
27 |      "params": {
28 |        "lr": 0.0003,
29 |        "betas": [0.9, 0.999],
30 |        "eps": 1.0e-8,
31 |      }
32 |    },
33 |    "zero_optimization": {
34 |     "stage": 1,
35 |     "allgather_partitions": True,
36 |     "allgather_bucket_size": 500000000,
37 |     "overlap_comm": True,
38 |     "reduce_scatter": True,
39 |     "reduce_bucket_size": 500000000,
40 |     "contiguous_gradients": True,
41 |     "cpu_offload": False
42 |   },
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0,
57 |    "hidden-dropout": 0,
58 |    "attention-dropout": 0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "fp16": true,
63 |      "enabled": true,
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    # misc. training settings
71 |    "train-iters": 320000,
72 |    "lr-decay-iters": 320000,
73 |    "distributed-backend": "nccl",
74 |    "lr-decay-style": "cosine",
75 |    "warmup": 0.01,
76 |    "save-interval": 10000,
77 |    "eval-interval": 1000,
78 |    "eval-iters": 10,
79 | 
80 |    # logging
81 |    "log-interval": 100,
82 |    "steps_per_print": 10,
83 |    "keep-last-n-checkpoints": 4,
84 |    "wall_clock_breakdown": true,
85 | }
86 | 


--------------------------------------------------------------------------------
/configs/myconfigs/model_config.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 4,
 6 |    "model-parallel-size": 2,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 12,
10 |    "hidden-size": 768,
11 |    "num-attention-heads": 12,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.0006,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 0,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0.0,
57 |    "hidden-dropout": 0.0,
58 |    "attention-dropout": 0.0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "enabled": true,
63 |      "loss_scale": 0,
64 |      "loss_scale_window": 1000,
65 |      "hysteresis": 2,
66 |      "min_loss_scale": 1
67 |    },
68 | 
69 |    # misc. training settings
70 |    "train-iters": 320000,
71 |    "lr-decay-iters": 320000,
72 |    "distributed-backend": "nccl",
73 |    "lr-decay-style": "cosine",
74 |    "warmup": 0.01,
75 |    "save-interval": 10000,
76 |    "eval-interval": 1000,
77 |    "eval-iters": 10,
78 | 
79 |    # logging
80 |    "log-interval": 100,
81 |    "steps_per_print": 10,
82 |    "keep-last-n-checkpoints": 4,
83 |    "wall_clock_breakdown": true,
84 | }
85 | 


--------------------------------------------------------------------------------
/configs/bnb_small.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 12,
10 |    "hidden-size": 768,
11 |    "num-attention-heads": 12,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 |    "use-bnb-optimizer": true,
18 | 
19 |    # these should provide some speedup but takes a while to build, set to true if desired
20 |    "scaled-upper-triang-masked-softmax-fusion": false,
21 |    "bias-gelu-fusion": false,
22 | 
23 | 
24 |    # optimizer settings
25 |    "optimizer": {
26 |      "type": "Adam",
27 |      "params": {
28 |        "lr": 0.0006,
29 |        "betas": [0.9, 0.999],
30 |        "eps": 1.0e-8,
31 |      }
32 |    },
33 |    "zero_optimization": {
34 |     "stage": 0,
35 |     "allgather_partitions": True,
36 |     "allgather_bucket_size": 500000000,
37 |     "overlap_comm": True,
38 |     "reduce_scatter": True,
39 |     "reduce_bucket_size": 500000000,
40 |     "contiguous_gradients": True,
41 |     "cpu_offload": False
42 |   },
43 | 
44 |    # batch / data settings
45 |    "train_micro_batch_size_per_gpu": 4,
46 |    "data-impl": "mmap",
47 |    "split": "949,50,1",
48 | 
49 |    # activation checkpointing
50 |    "checkpoint-activations": true,
51 |    "checkpoint-num-layers": 1,
52 |    "partition-activations": true,
53 |    "synchronize-each-layer": true,
54 | 
55 |    # regularization
56 |    "gradient_clipping": 1.0,
57 |    "weight-decay": 0.0,
58 |    "hidden-dropout": 0.0,
59 |    "attention-dropout": 0.0,
60 | 
61 |    # precision settings
62 |    "fp16": {
63 |      "enabled": true,
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    # misc. training settings
71 |    "train-iters": 320000,
72 |    "lr-decay-iters": 320000,
73 |    "distributed-backend": "nccl",
74 |    "lr-decay-style": "cosine",
75 |    "warmup": 0.01,
76 |    "save-interval": 10000,
77 |    "eval-interval": 1000,
78 |    "eval-iters": 10,
79 | 
80 |    # logging
81 |    "log-interval": 100,
82 |    "steps_per_print": 10,
83 |    "keep-last-n-checkpoints": 4,
84 |    "wall_clock_breakdown": true,
85 | }
86 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from torch.utils import cpp_extension
 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 4 | from torch.cuda import is_available as torch_cuda_available
 5 | from pathlib import Path
 6 | import subprocess
 7 | 
 8 | 
 9 | def _get_cuda_bare_metal_version(cuda_dir):
10 |     raw_output = subprocess.check_output(
11 |         [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
12 |     )
13 |     output = raw_output.split()
14 |     release_idx = output.index("release") + 1
15 |     release = output[release_idx].split(".")
16 |     bare_metal_major = release[0]
17 |     bare_metal_minor = release[1][0]
18 | 
19 |     return raw_output, bare_metal_major, bare_metal_minor
20 | 
21 | 
22 | srcpath = Path(__file__).parent.absolute()
23 | cc_flag = []
24 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
25 | if int(bare_metal_major) >= 11:
26 |     cc_flag.append("-gencode")
27 |     cc_flag.append("arch=compute_80,code=sm_80")
28 | 
29 | nvcc_flags = [
30 |     "-O3",
31 |     "-gencode",
32 |     "arch=compute_70,code=sm_70",
33 |     "--use_fast_math",
34 |     "-U__CUDA_NO_HALF_OPERATORS__",
35 |     "-U__CUDA_NO_HALF_CONVERSIONS__",
36 |     "--expt-relaxed-constexpr",
37 |     "--expt-extended-lambda",
38 | ]
39 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag}
40 | layernorm_cuda_args = {
41 |     "cxx": ["-O3"],
42 |     "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"],
43 | }
44 | setup(
45 |     name="fused_kernels",
46 |     version="0.0.1",
47 |     author="Sid Black & Alejandro Molina et al.",
48 |     author_email="alejandro.molina@aleph-alpha.de",
49 |     include_package_data=False,
50 |     ext_modules=[
51 |         CUDAExtension(
52 |             "scaled_upper_triang_masked_softmax_cuda",
53 |             [
54 |                 str(srcpath / "scaled_upper_triang_masked_softmax.cpp"),
55 |                 str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"),
56 |             ],
57 |             extra_compile_args=cuda_ext_args,
58 |         ),
59 |         CUDAExtension(
60 |             "scaled_masked_softmax_cuda",
61 |             [
62 |                 str(srcpath / "scaled_masked_softmax.cpp"),
63 |                 str(srcpath / "scaled_masked_softmax_cuda.cu"),
64 |             ],
65 |             extra_compile_args=cuda_ext_args,
66 |         ),
67 |     ]
68 |     if torch_cuda_available()
69 |     else [],
70 |     cmdclass={"build_ext": BuildExtension},
71 | )
72 | 


--------------------------------------------------------------------------------
/megatron/model/norms.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.nn import LayerNorm as LayerNorm
 3 | 
 4 | 
 5 | def get_norm(neox_args):
 6 |     if neox_args.norm == "rmsnorm":
 7 |         norm = RMSNorm
 8 |         eps = neox_args.rms_norm_epsilon
 9 |     elif neox_args.norm == "layernorm":
10 |         eps = neox_args.layernorm_epsilon
11 |         norm = LayerNorm
12 |     elif neox_args.norm == "scalenorm":
13 |         eps = neox_args.scalenorm_epsilon
14 |         norm = ScaleNorm
15 |     else:
16 |         raise ValueError(f"norm {neox_args.norm} not recognized")
17 |     return norm, eps
18 | 
19 | 
20 | class RMSNorm(torch.nn.Module):
21 |     def __init__(self, dim, p=-1.0, eps=1e-8, bias=False):
22 |         """
23 |             Root Mean Square Layer Normalization
24 |         :param dim: model size
25 |         :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
26 |         :param eps:  epsilon value, default 1e-8
27 |         :param bias: whether use bias term for RMSNorm, disabled by
28 |             default because RMSNorm doesn't enforce re-centering invariance.
29 |         """
30 |         super(RMSNorm, self).__init__()
31 | 
32 |         self.eps = eps
33 |         self.d = dim
34 |         self.p = p
35 |         self.bias = bias
36 | 
37 |         self.scale = torch.nn.Parameter(torch.ones(dim))
38 |         self.register_parameter("scale", self.scale)
39 | 
40 |         if self.bias:
41 |             self.offset = torch.nn.Parameter(torch.zeros(dim))
42 |             self.register_parameter("offset", self.offset)
43 | 
44 |     def forward(self, x):
45 |         if self.p < 0.0 or self.p > 1.0:
46 |             norm_x = x.norm(2, dim=-1, keepdim=True)
47 |             d_x = self.d
48 |         else:
49 |             partial_size = int(self.d * self.p)
50 |             partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)
51 | 
52 |             norm_x = partial_x.norm(2, dim=-1, keepdim=True)
53 |             d_x = partial_size
54 | 
55 |         rms_x = norm_x * d_x ** (-1.0 / 2)
56 |         x_normed = x / (rms_x + self.eps)
57 | 
58 |         if self.bias:
59 |             return self.scale * x_normed + self.offset
60 | 
61 |         return self.scale * x_normed
62 | 
63 | 
64 | class ScaleNorm(torch.nn.Module):
65 |     def __init__(self, dim, eps=1e-5):
66 |         super().__init__()
67 |         self.g = torch.nn.Parameter(torch.ones(1))
68 |         self.eps = eps
69 | 
70 |     def forward(self, x):
71 |         n = torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps)
72 |         return x / n * self.g
73 | 


--------------------------------------------------------------------------------
/configs/small_bf16.yml:
--------------------------------------------------------------------------------
 1 | # GPT-2 pretraining setup
 2 | {
 3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 4 |    # across the node boundaries )
 5 |    "pipe-parallel-size": 1,
 6 |    "model-parallel-size": 1,
 7 | 
 8 |    # model settings
 9 |    "num-layers": 12,
10 |    "hidden-size": 768,
11 |    "num-attention-heads": 12,
12 |    "seq-length": 2048,
13 |    "max-position-embeddings": 2048,
14 |    "norm": "layernorm",
15 |    "pos-emb": "rotary",
16 |    "no-weight-tying": true,
17 | 
18 |    # these should provide some speedup but takes a while to build, set to true if desired
19 |    "scaled-upper-triang-masked-softmax-fusion": false,
20 |    "bias-gelu-fusion": false,
21 | 
22 | 
23 |    # optimizer settings
24 |    "optimizer": {
25 |      "type": "Adam",
26 |      "params": {
27 |        "lr": 0.0006,
28 |        "betas": [0.9, 0.999],
29 |        "eps": 1.0e-8,
30 |      }
31 |    },
32 |    "zero_optimization": {
33 |     "stage": 0,
34 |     "allgather_partitions": True,
35 |     "allgather_bucket_size": 500000000,
36 |     "overlap_comm": True,
37 |     "reduce_scatter": True,
38 |     "reduce_bucket_size": 500000000,
39 |     "contiguous_gradients": True,
40 |     "cpu_offload": False
41 |   },
42 | 
43 |    # batch / data settings
44 |    "train_micro_batch_size_per_gpu": 4,
45 |    "data-impl": "mmap",
46 |    "split": "949,50,1",
47 | 
48 |    # activation checkpointing
49 |    "checkpoint-activations": true,
50 |    "checkpoint-num-layers": 1,
51 |    "partition-activations": true,
52 |    "synchronize-each-layer": true,
53 | 
54 |    # regularization
55 |    "gradient_clipping": 1.0,
56 |    "weight-decay": 0.0,
57 |    "hidden-dropout": 0.0,
58 |    "attention-dropout": 0.0,
59 | 
60 |    # precision settings
61 |    "fp16": {
62 |      "enabled": true,
63 |      "type": "bfloat16", # set bf16 as precision
64 |      "loss_scale": 0,
65 |      "loss_scale_window": 1000,
66 |      "hysteresis": 2,
67 |      "min_loss_scale": 1
68 |    },
69 | 
70 |    "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
71 |    # misc. training settings
72 |    "train-iters": 320000,
73 |    "lr-decay-iters": 320000,
74 |    "distributed-backend": "nccl",
75 |    "lr-decay-style": "cosine",
76 |    "warmup": 0.01,
77 |    "save-interval": 10000,
78 |    "eval-interval": 1000,
79 |    "eval-iters": 10,
80 | 
81 |    # logging
82 |    "log-interval": 100,
83 |    "steps_per_print": 10,
84 |    "keep-last-n-checkpoints": 4,
85 |    "wall_clock_breakdown": true,
86 | }
87 | 


--------------------------------------------------------------------------------
/megatron/data/blendable_dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Blendable dataset."""
16 | 
17 | import time
18 | 
19 | import numpy as np
20 | import torch
21 | 
22 | from megatron import print_rank_0
23 | from megatron import mpu
24 | 
25 | 
26 | class BlendableDataset(torch.utils.data.Dataset):
27 |     def __init__(self, datasets, weights):
28 |         self.datasets = datasets
29 |         num_datasets = len(datasets)
30 |         assert num_datasets == len(weights)
31 | 
32 |         self.size = 0
33 |         for dataset in self.datasets:
34 |             self.size += len(dataset)
35 | 
36 |         # Normalize weights.
37 |         weights = np.array(weights, dtype=np.float64)
38 |         sum_weights = np.sum(weights)
39 |         assert sum_weights > 0.0
40 |         weights /= sum_weights
41 | 
42 |         # Build indices.
43 |         start_time = time.time()
44 |         assert num_datasets < 255
45 |         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
46 |         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
47 | 
48 |         from megatron.data import helpers
49 | 
50 |         helpers.build_blending_indices(
51 |             self.dataset_index,
52 |             self.dataset_sample_index,
53 |             weights,
54 |             num_datasets,
55 |             self.size,
56 |             torch.distributed.get_rank() == 0,
57 |         )
58 | 
59 |         print(
60 |             "> RANK {} elapsed time for building blendable dataset indices: "
61 |             "{:.2f} (sec)".format(
62 |                 torch.distributed.get_rank(), time.time() - start_time
63 |             )
64 |         )
65 | 
66 |     def __len__(self):
67 |         return self.size
68 | 
69 |     def __getitem__(self, idx):
70 |         try:
71 |             dataset_idx = self.dataset_index[idx]
72 |             sample_idx = self.dataset_sample_index[idx]
73 |             return self.datasets[dataset_idx][sample_idx]
74 |         except IndexError:
75 |             new_idx = idx % len(self)
76 |             print(
77 |                 f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})"
78 |             )
79 |             return self[new_idx]
80 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # wandb logs
132 | wandb/
133 | 
134 | # data files
135 | data/**/*.idx
136 | data/**/*.bin
137 | data/**/*.json*
138 | data/**/*.txt
139 | data/**/*.gz
140 | data/**/*.np*
141 | data/**/*.npy
142 | checkpoints/
143 | .vscode/
144 | *.pt
145 | *.ckpt
146 | 
147 | #test logs
148 | test_checkpoint/
149 | test_logs/
150 | logs/
151 | tensorboard/
152 | src/
153 | 


--------------------------------------------------------------------------------
/megatron/mpu/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import torch
17 | 
18 | 
19 | def ensure_divisibility(numerator, denominator):
20 |     """Ensure that numerator is divisible by the denominator."""
21 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
22 |         numerator, denominator
23 |     )
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
34 |     """Split a tensor along its last dimension.
35 |     Arguments:
36 |         tensor: input tensor.
37 |         num_partitions: number of partitions to split the tensor
38 |         contiguous_split_chunks: If True, make each chunk contiguous
39 |                                  in memory.
40 |     """
41 |     # Get the size and dimension.
42 |     last_dim = tensor.dim() - 1
43 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
44 |     # Split.
45 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
46 |     # Note: torch.split does not create contiguous tensors by default.
47 |     if contiguous_split_chunks:
48 |         return tuple(chunk.contiguous() for chunk in tensor_list)
49 | 
50 |     return tensor_list
51 | 
52 | 
53 | class VocabUtility:
54 |     """Split the vocabulary into `world_size` chunks amd return the
55 |     first and last index of the vocabulary belonging to the `rank`
56 |     partition: Note that indices in [first, last]"""
57 | 
58 |     @staticmethod
59 |     def vocab_range_from_per_partition_vocab_size(
60 |         per_partition_vocab_size, rank, world_size
61 |     ):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size
71 |         )
72 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);
26 | 
27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
28 |                        torch::Tensor const& softmax_results,
29 |                        float scale_factor);
30 | 
31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor)
32 | {
33 |     AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
34 |     AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
35 |                    (input.scalar_type() == at::ScalarType::BFloat16),
36 |                "Only fp16 and bf16 are supported");
37 | 
38 |     return fwd_cuda(input, scale_factor);
39 | }
40 | 
41 | torch::Tensor bwd(torch::Tensor const& output_grads,
42 |                   torch::Tensor const& softmax_results,
43 |                   float scale_factor)
44 | {
45 |     AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
46 |     AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
47 | 
48 |     AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 |                    (output_grads.scalar_type() == at::ScalarType::BFloat16),
50 |                "Only fp16 and bf16 are supported");
51 |     AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 |                    (softmax_results.scalar_type() == at::ScalarType::BFloat16),
53 |                "Only fp16 and bf16 are supported");
54 | 
55 |     return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 | 
58 | }  // end namespace scaled_upper_triang_masked_softmax
59 | }  // end namespace fused_softmax
60 | }  // end namespace multihead_attn
61 | 
62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
63 | {
64 |     m.def("forward",
65 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
66 |           "Self Multihead Attention scaled, time masked softmax -- Forward.");
67 |     m.def("backward",
68 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
69 |           "Self Multihead Attention scaled, time masked softmax -- Backward.");
70 | }
71 | 


--------------------------------------------------------------------------------
/megatron/neox_arguments/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | NeoX Arguments manages all configuration arguments.
 3 | 
 4 | **general**
 5 | 
 6 | * The implementation makes use of the python dataclass.
 7 | * The main class 'NeoXArgs' (in ./arguments) exposes all configuration attributes that are relevant to GPT NeoX
 8 | * No attributes are nested (apart from attributes with type dict)
 9 | * Output functions (enable_logging, save_yml, print) are implemented
10 | * Instantiation always runs NeoXArgs.__post_init__(), which calculates derived values and performs a validation (values, types, keys).
11 | * it is possible to set undefined attributes (e.g. line of code 'NeoXArgs().my_undefined_config = 42' works fine); such set attributes are not validated
12 | * It is possible to update attributes (e.g. line of code 'NeoXArgs().do_train = True' works fine); a validation can be performed by calling the validation functions on the class instance
13 | * In order to avoid setting undefined attributes you can use the function NeoXArgs().update_value(); this function raises an error if the to be set attribute is not defined
14 | 
15 | **instantiation**
16 | NeoX args can be instantiated with the following options
17 | 
18 | * NeoXArgs.from_ymls(["path_to_yaml1", "path_to_yaml2", ...]): load yaml configuration files and instantiate with the values provided; checks for duplications and unknown arguments are performed
19 | * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from dict; checks unknown arguments are performed
20 | 
21 | * NeoXArgs.consume_deepy_args(): entry point for deepy.py configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main
22 | * NeoXArgs.consume_neox_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). So produced arguments can be read with consume_neox_args() to instantiate a NeoXArgs instance.
23 | 
24 | 
25 | **code structure**
26 | 
27 | * NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
28 | * The Subclasses group args according to their purpose
29 | * The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be ommitted
30 | * The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be ommitted
31 | * calculated attributes (decorator '@property') are available as attribute, but would not be included in dataclass fields (e.g. NeoXArgs().__dataclass_fields__.items())
32 | * refer to docstrings in code for more information
33 | """
34 | 
35 | 
36 | from .arguments import NeoXArgs
37 | 


--------------------------------------------------------------------------------
/tests/test_configs/test_train_base.yml:
--------------------------------------------------------------------------------
  1 | # GPT_2 pretraining setup
  2 | {
  3 |    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  4 |    # across the node boundaries )
  5 |    "pipe_parallel_size": 0,
  6 |    "model_parallel_size": 1,
  7 | 
  8 |    # model settings
  9 |    "num_layers": 2,
 10 |    "hidden_size": 192,
 11 |    "num_attention_heads": 6,
 12 |    "seq_length": 1024,
 13 |    "max_position_embeddings": 1024,
 14 |    "norm": "layernorm",
 15 |    "pos_emb": "rotary",
 16 |    "no_weight_tying": true,
 17 | 
 18 |    # these should provide some speedup but takes a while to build, set to true if desired
 19 |    "scaled_upper_triang_masked_softmax_fusion": false,
 20 |    "bias_gelu_fusion": false,
 21 | 
 22 |    # optimizer settings
 23 |    "optimizer": {
 24 |      "type": "Adam",
 25 |      "params": {
 26 |        "lr": 0.0006,
 27 |        "betas": [0.9, 0.999],
 28 |        "eps": 1.0e-8,
 29 |      }
 30 |    },
 31 | 
 32 |    "zero_optimization": {
 33 |     "stage": 0,
 34 |     "allgather_partitions": True,
 35 |     "allgather_bucket_size": 500000000,
 36 |     "overlap_comm": True,
 37 |     "reduce_scatter": True,
 38 |     "reduce_bucket_size": 500000000,
 39 |     "contiguous_gradients": True,
 40 |     "cpu_offload": False
 41 |   },
 42 | 
 43 |    # batch / data settings
 44 |    "train_micro_batch_size_per_gpu": 4,
 45 |    "data_impl": "mmap",
 46 |    "split": "949,50,1",
 47 | 
 48 |    # activation checkpointing
 49 |    "checkpoint_activations": true,
 50 |    "checkpoint_num_layers": 1,
 51 |    "partition_activations": true,
 52 |    "synchronize_each_layer": true,
 53 | 
 54 |    # regularization
 55 |    "gradient_clipping": 1.0,
 56 |    "weight_decay": 0.0,
 57 |    "hidden_dropout": 0.0,
 58 |    "attention_dropout": 0.0,
 59 | 
 60 |    # precision settings
 61 |    "fp16": {
 62 |      "enabled": true,
 63 |      "loss_scale": 0,
 64 |      "loss_scale_window": 1000,
 65 |      "hysteresis": 2,
 66 |      "min_loss_scale": 1
 67 |    },
 68 | 
 69 |    # misc. training settings
 70 |    "train_iters": 320000,
 71 |    "lr_decay_iters": 320000,
 72 |    "distributed_backend": "nccl",
 73 |    "lr_decay_style": "cosine",
 74 |    "warmup": 0.01,
 75 |    "save_interval": 10000,
 76 |    "eval_interval": 1000,
 77 |    "eval_iters": 10,
 78 | 
 79 |    # logging
 80 |    "log_interval": 100,
 81 |    "steps_per_print": 10,
 82 |    "keep_last_n_checkpoints": 4,
 83 |    "wall_clock_breakdown": true,
 84 | 
 85 |   # Suggested data paths when using GPT_NeoX locally
 86 |   "data_path": "data/enron/enron_text_document",
 87 | 
 88 |   # or for weighted datasets:
 89 |   # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 90 |   # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 91 |   # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
 92 |   # "train-data-weights": [1., 2.],
 93 |   # "test-data-weights": [2., 1.],
 94 |   # "valid-data-weights": [0.5, 0.4],
 95 | 
 96 |   "vocab_file": "data/gpt2-vocab.json",
 97 |   "merge_file": "data/gpt2-merges.txt",
 98 |   "save": "test_checkpoint",
 99 |   "load": "test_checkpoint",
100 |   "tensorboard_dir": "test_tensorboard",
101 |   "log_dir": "test_logs",
102 | 
103 | }
104 | 


--------------------------------------------------------------------------------
/configs/myconfigs/20B.yml:
--------------------------------------------------------------------------------
  1 | # DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
  2 | # GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
  3 | # the model in memory.
  4 | 
  5 | {
  6 |   # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in
  7 |   "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
  8 |   "save": "./20B_checkpoints",
  9 |   "load": "~/slim_weights",
 10 | 
 11 |   # If finetuning, edit the following to the location of your finetuning dataset:
 12 |   "data-path": "./data/train_data/passage_crits_text_document",
 13 | 
 14 |   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 15 |   # across the node boundaries )
 16 |   "pipe-parallel-size": 4,
 17 |   "model-parallel-size": 2,
 18 | 
 19 |   # model settings
 20 |   "num-layers": 44,
 21 |   "hidden-size": 6144,
 22 |   "num-attention-heads": 64,
 23 |   "seq-length": 2048,
 24 |   "max-position-embeddings": 2048,
 25 |   "norm": "layernorm",
 26 |   "pos-emb": "rotary",
 27 |   "rotary_pct": 0.25,
 28 |   "no-weight-tying": true,
 29 |   "gpt_j_residual": true,
 30 |   "output_layer_parallelism": "column",
 31 |   "scaled-upper-triang-masked-softmax-fusion": true,
 32 |   "bias-gelu-fusion": true,
 33 | 
 34 |   # init methods
 35 |   "init_method": "small_init",
 36 |   "output_layer_init_method": "wang_init",
 37 | 
 38 |   # optimizer settings
 39 |   "optimizer": {
 40 |     "type": "Adam",
 41 |     "params": {
 42 |       "lr": 0.97e-4,
 43 |       "betas": [0.9, 0.95],
 44 |       "eps": 1.0e-8,
 45 |       }
 46 |       },
 47 | 
 48 |   "min_lr": 0.97e-5,
 49 |   "zero_optimization": {
 50 |   "stage": 1,
 51 |   "allgather_partitions": True,
 52 |   "allgather_bucket_size": 1260000000,
 53 |   "overlap_comm": True,
 54 |   "reduce_scatter": True,
 55 |   "reduce_bucket_size": 1260000000,
 56 |   "contiguous_gradients": True,
 57 |   "cpu_offload": False
 58 |   },
 59 | 
 60 |   # batch / data settings (assuming 96 GPUs)
 61 |   "train_micro_batch_size_per_gpu": 4,
 62 |   "gradient_accumulation_steps": 32,
 63 |   "data-impl": "mmap",
 64 |   "split": "995,4,1",
 65 | 
 66 |   # activation checkpointing
 67 |   "checkpoint-activations": true,
 68 |   "checkpoint-num-layers": 1,
 69 |   "partition-activations": false,
 70 |   "synchronize-each-layer": true,
 71 | 
 72 |   # regularization
 73 |   "gradient_clipping": 1.0,
 74 |   "weight-decay": 0.01,
 75 |   "hidden-dropout": 0,
 76 |   "attention-dropout": 0,
 77 | 
 78 |   # precision settings
 79 |   "fp16": {
 80 |     "fp16": true,
 81 |     "enabled": true,
 82 |     "loss_scale": 0,
 83 |     "loss_scale_window": 1000,
 84 |     "initial_scale_power": 12,
 85 |     "hysteresis": 2,
 86 |     "min_loss_scale": 1
 87 |     },
 88 | 
 89 |   # misc. training settings
 90 |   "train-iters": 150000,
 91 |   "lr-decay-iters": 150000,
 92 | 
 93 |   "distributed-backend": "nccl",
 94 |   "lr-decay-style": "cosine",
 95 |   "warmup": 0.01,
 96 |   "save-interval": 500,
 97 |   "eval-interval": 1000,
 98 |   "eval-iters": 10,
 99 | 
100 |   # logging
101 |   "log-interval": 2,
102 |   "steps_per_print": 2,
103 |   "wall_clock_breakdown": false,
104 | 
105 |   ### NEW DATA: ####
106 |   "tokenizer_type": "HFTokenizer",
107 |   "tensorboard-dir": "./tensorboard",
108 |   "log-dir": "./logs",
109 | 
110 | }
111 | 


--------------------------------------------------------------------------------
/configs/20B.yml:
--------------------------------------------------------------------------------
  1 | # DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
  2 | # GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
  3 | # the model in memory.
  4 | 
  5 | {
  6 |   # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in
  7 |   "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
  8 |   "save": "./20B_checkpoints",
  9 |   "load": "~/slim_weights",
 10 | 
 11 |   # If finetuning, edit the following to the location of your finetuning dataset:
 12 |   "data-path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document",
 13 | 
 14 |   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
 15 |   # across the node boundaries )
 16 |   "pipe-parallel-size": 4,
 17 |   "model-parallel-size": 2,
 18 | 
 19 |   # model settings
 20 |   "num-layers": 44,
 21 |   "hidden-size": 6144,
 22 |   "num-attention-heads": 64,
 23 |   "seq-length": 2048,
 24 |   "max-position-embeddings": 2048,
 25 |   "norm": "layernorm",
 26 |   "pos-emb": "rotary",
 27 |   "rotary_pct": 0.25,
 28 |   "no-weight-tying": true,
 29 |   "gpt_j_residual": true,
 30 |   "output_layer_parallelism": "column",
 31 |   "scaled-upper-triang-masked-softmax-fusion": true,
 32 |   "bias-gelu-fusion": true,
 33 | 
 34 |   # init methods
 35 |   "init_method": "small_init",
 36 |   "output_layer_init_method": "wang_init",
 37 | 
 38 |   # optimizer settings
 39 |   "optimizer": {
 40 |     "type": "Adam",
 41 |     "params": {
 42 |       "lr": 0.97e-4,
 43 |       "betas": [0.9, 0.95],
 44 |       "eps": 1.0e-8,
 45 |       }
 46 |       },
 47 | 
 48 |   "min_lr": 0.97e-5,
 49 |   "zero_optimization": {
 50 |   "stage": 1,
 51 |   "allgather_partitions": True,
 52 |   "allgather_bucket_size": 1260000000,
 53 |   "overlap_comm": True,
 54 |   "reduce_scatter": True,
 55 |   "reduce_bucket_size": 1260000000,
 56 |   "contiguous_gradients": True,
 57 |   "cpu_offload": False
 58 |   },
 59 | 
 60 |   # batch / data settings (assuming 96 GPUs)
 61 |   "train_micro_batch_size_per_gpu": 4,
 62 |   "gradient_accumulation_steps": 32,
 63 |   "data-impl": "mmap",
 64 |   "split": "995,4,1",
 65 | 
 66 |   # activation checkpointing
 67 |   "checkpoint-activations": true,
 68 |   "checkpoint-num-layers": 1,
 69 |   "partition-activations": false,
 70 |   "synchronize-each-layer": true,
 71 | 
 72 |   # regularization
 73 |   "gradient_clipping": 1.0,
 74 |   "weight-decay": 0.01,
 75 |   "hidden-dropout": 0,
 76 |   "attention-dropout": 0,
 77 | 
 78 |   # precision settings
 79 |   "fp16": {
 80 |     "fp16": true,
 81 |     "enabled": true,
 82 |     "loss_scale": 0,
 83 |     "loss_scale_window": 1000,
 84 |     "initial_scale_power": 12,
 85 |     "hysteresis": 2,
 86 |     "min_loss_scale": 1
 87 |     },
 88 | 
 89 |   # misc. training settings
 90 |   "train-iters": 150000,
 91 |   "lr-decay-iters": 150000,
 92 | 
 93 |   "distributed-backend": "nccl",
 94 |   "lr-decay-style": "cosine",
 95 |   "warmup": 0.01,
 96 |   "save-interval": 500,
 97 |   "eval-interval": 1000,
 98 |   "eval-iters": 10,
 99 | 
100 |   # logging
101 |   "log-interval": 2,
102 |   "steps_per_print": 2,
103 |   "wall_clock_breakdown": false,
104 | 
105 |   ### NEW DATA: ####
106 |   "tokenizer_type": "HFTokenizer",
107 |   "tensorboard-dir": "./tensorboard",
108 |   "log-dir": "./logs",
109 | 
110 | }
111 | 


--------------------------------------------------------------------------------
/configs/gen_docs.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | sys.path.append(
 5 |     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
 6 | )
 7 | from megatron.neox_arguments import neox_args, deepspeed_args
 8 | from inspect import getmembers, getsource
 9 | from dataclasses import field, is_dataclass
10 | from itertools import tee, zip_longest
11 | import pathlib
12 | 
13 | 
14 | def pairwise(iterable):
15 |     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
16 |     a, b = tee(iterable)
17 |     next(b, None)
18 |     return zip_longest(a, b)
19 | 
20 | 
21 | def get_docs(module):
22 |     ARGS_CLASSES = getmembers(module, is_dataclass)
23 |     results = {}
24 |     for name, dcls in ARGS_CLASSES:
25 |         assert is_dataclass(dcls)
26 |         src = getsource(dcls)
27 |         d = dcls()
28 |         loc = 0
29 |         results[name] = {"doc": d.__doc__.strip(), "attributes": {}}
30 |         for cur, _next in pairwise(d.__dataclass_fields__.items()):
31 |             field_name, field_def = cur
32 |             field_type = field_def.type
33 |             if hasattr(field_type, "__name__"):
34 |                 field_type = field_type.__name__
35 |             else:
36 |                 field_type = str(field_type)
37 | 
38 |             field_default = field_def.default
39 | 
40 |             # try to find the field definition
41 |             loc = src.find(f" {field_name}:", loc + len(field_name) + 1)
42 | 
43 |             if _next is not None:
44 |                 next_field_name, _ = _next
45 |                 # try to find the next field definition
46 |                 next_loc = src.find(f"{next_field_name}:", loc + len(field_name))
47 |             else:
48 |                 next_loc = len(src)
49 | 
50 |             # try to get the docstring
51 |             _src = src[loc:next_loc].strip()
52 |             if '"""' in _src:
53 |                 doc = _src.split('"""')[1].strip()
54 |             elif "'''" in _src:
55 |                 doc = _src.split("'''")[1].strip()
56 |             else:
57 |                 doc = ""
58 |             results[name]["attributes"][field_name] = {
59 |                 "name": field_name,
60 |                 "type": field_type,
61 |                 "default": field_default,
62 |                 "doc": doc,
63 |             }
64 |     return results
65 | 
66 | 
67 | def to_md(docs, intro_str=""):
68 |     """
69 |     Writes the docs dictionary to markdown format
70 |     """
71 |     lines = []
72 |     lines.append(intro_str)
73 |     for name, doc in docs.items():
74 |         lines.append(f"## {name}")
75 |         lines.append(f"{doc['doc']}")
76 |         lines.append("")
77 |         for field_name, field_def in doc["attributes"].items():
78 |             # attribute name and type
79 |             lines.append(f"- **{field_name}**: {field_def['type']}")
80 |             # default value
81 |             lines.append(f"    Default = {str(field_def['default'])}")
82 |             lines.append(f"    {field_def['doc']}")
83 |             lines.append("")
84 |     return "\n\n".join(lines)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     docs = get_docs(neox_args)
89 |     docs.update(get_docs(deepspeed_args))
90 |     intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n"""
91 |     md = to_md(docs, intro_str=intro_str)
92 |     with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f:
93 |         f.write(md)
94 | 


--------------------------------------------------------------------------------
/generate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) 2021 Josh Levy-Kramer <josh@levykramer.co.uk>. All rights reserved.
 3 | # This file is based on code by the authors denoted below and has been modified from its original version.
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | from megatron.utils import print_rank_0, setup_for_inference_or_eval
19 | 
20 | from megatron.text_generation_utils import (
21 |     generate_samples_input_from_file,
22 |     generate_samples_from_prompt,
23 |     generate_samples_unconditional,
24 |     generate_samples_interactive,
25 | )
26 | 
27 | 
28 | def main():
29 |     """
30 |     Generate text/sample model
31 |     """
32 |     model, neox_args = setup_for_inference_or_eval(use_cache=True)
33 |     if neox_args.recompute:
34 |         model.module.inference_mode(
35 |             use_cache=False
36 |         )  # don't use kv cache if recomputing
37 |     if neox_args.text_gen_type == "unconditional":
38 |         print_rank_0(
39 |             f"Generating samples unconditionally and saving results to {neox_args.sample_output_file}"
40 |         )
41 |         generate_samples_unconditional(
42 |             neox_args=neox_args,
43 |             model=model,
44 |             number_of_samples=neox_args.num_samples,
45 |             output_file=neox_args.sample_output_file,
46 |             maximum_tokens=neox_args.maximum_tokens,
47 |             recompute=neox_args.recompute,
48 |             temperature=neox_args.temperature,
49 |             top_k=neox_args.top_k,
50 |             top_p=neox_args.top_p,
51 |         )
52 | 
53 |     elif neox_args.text_gen_type == "input-file":
54 |         print_rank_0(
55 |             f"Generating samples from input file {neox_args.sample_input_file}"
56 |         )
57 |         assert neox_args.sample_input_file is not None
58 |         generate_samples_input_from_file(
59 |             neox_args=neox_args,
60 |             model=model,
61 |             input_file=neox_args.sample_input_file,
62 |             output_file=neox_args.sample_output_file,
63 |             maximum_tokens=neox_args.maximum_tokens,
64 |             recompute=neox_args.recompute,
65 |             temperature=neox_args.temperature,
66 |             top_k=neox_args.top_k,
67 |             top_p=neox_args.top_p,
68 |         )
69 | 
70 |     elif neox_args.text_gen_type == "interactive":
71 |         generate_samples_interactive(
72 |             neox_args=neox_args,
73 |             model=model,
74 |             recompute=neox_args.recompute,
75 |             temperature=neox_args.temperature,
76 |             maximum_tokens=neox_args.maximum_tokens,
77 |             top_k=neox_args.top_k,
78 |             top_p=neox_args.top_p,
79 |         )
80 | 
81 |     else:
82 |         raise ValueError(
83 |             f"`text-gen-type` either not specified or not recognised: {neox_args.text_gen_type}"
84 |         )
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     main()
89 | 


--------------------------------------------------------------------------------
/tests/model/test_model_instantiation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | instantiate models with different configurations as a first possible point of failure
  3 | """
  4 | 
  5 | import pytest
  6 | 
  7 | import torch
  8 | import os
  9 | from ..common import distributed_test, model_setup, clear_test_dirs, parametrize, binary
 10 | 
 11 | PARAMS_TO_TEST = {
 12 |     "pipe_parallel_size,model_parallel_size,world_size": [
 13 |         [0, 1, 1],
 14 |         [1, 2, 2],
 15 |         [0, 2, 2],
 16 |     ],
 17 |     "no_weight_tying": binary,
 18 |     "attention_config": [
 19 |         [[["global"], "all"]],
 20 |         [[["local"], "all"]],
 21 |         [[["sparse_variable"], "all"]],
 22 |         [[["sparse_fixed"], "all"]],
 23 |     ],
 24 |     "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [
 25 |         [True, False],
 26 |         [False, True],
 27 |     ],
 28 |     "fp16,fp32_allreduce": [
 29 |         [
 30 |             {
 31 |                 "enabled": True,
 32 |                 "type": "bfloat16",
 33 |                 "loss_scale": 0,
 34 |                 "loss_scale_window": 1000,
 35 |                 "hysteresis": 2,
 36 |                 "min_loss_scale": 1,
 37 |             },
 38 |             True,
 39 |         ],
 40 |         [
 41 |             {
 42 |                 "enabled": True,
 43 |                 "loss_scale": 0,
 44 |                 "loss_scale_window": 1000,
 45 |                 "hysteresis": 2,
 46 |                 "min_loss_scale": 1,
 47 |             },
 48 |             False,
 49 |         ],
 50 |     ],
 51 | }
 52 | 
 53 | parameters, names = parametrize(
 54 |     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 55 | )
 56 | 
 57 | 
 58 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 59 | def test_instantiate(param_dict):
 60 |     @distributed_test(world_size=param_dict.pop("world_size", 2))
 61 |     def wrapper():
 62 |         run_test_model_instantiation(param_dict=param_dict)
 63 | 
 64 |     wrapper()
 65 | 
 66 | 
 67 | OPTIMIZER_PARAMS = {
 68 |     "optimizer": [
 69 |         {"type": "adam", "params": {"lr": 0.0006}},
 70 |         {"type": "onebitadam", "params": {"lr": 0.0006}},
 71 |         {"type": "cpu_adam", "params": {"lr": 0.0006}},
 72 |         {"type": "cpu_torch_adam", "params": {"lr": 0.0006}},
 73 |         {"type": "sm3", "params": {"lr": 0.0006}},
 74 |         {"type": "madgrad_wd", "params": {"lr": 0.0006}},
 75 |     ]
 76 | }
 77 | opt_params, opt_name = parametrize(
 78 |     OPTIMIZER_PARAMS, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 79 | )
 80 | 
 81 | 
 82 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 83 | def test_instantiate_optimizers(param_dict):
 84 |     @distributed_test(world_size=2)
 85 |     def wrapper():
 86 |         run_test_model_instantiation(param_dict=param_dict)
 87 | 
 88 |     wrapper()
 89 | 
 90 | 
 91 | def run_test_model_instantiation(yaml_list=None, param_dict=None):
 92 |     from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine
 93 | 
 94 |     model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
 95 |     if args_loaded.pipe_parallel_size < 2:
 96 |         assert isinstance(model, DeepSpeedEngine), "test model instantiation " + str(
 97 |             yaml_list
 98 |         )
 99 |     else:
100 |         assert isinstance(model, PipelineEngine), "test model instantiation " + str(
101 |             yaml_list
102 |         )
103 |     if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
104 |         clear_test_dirs()
105 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor);
26 | 
27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
28 |                        torch::Tensor const& softmax_results,
29 |                        float scale_factor);
30 | 
31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads);
32 | 
33 | torch::Tensor fwd(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor)
34 | {
35 |     AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
36 |     AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
37 |                    (input.scalar_type() == at::ScalarType::BFloat16),
38 |                "Only fp16 and bf16 are supported");
39 |     AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
40 | 
41 |     return fwd_cuda(input, mask, scale_factor);
42 | }
43 | 
44 | torch::Tensor bwd(torch::Tensor const& output_grads,
45 |                   torch::Tensor const& softmax_results,
46 |                   float scale_factor)
47 | {
48 |     AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
49 |     AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
50 | 
51 |     AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
52 |                    (output_grads.scalar_type() == at::ScalarType::BFloat16),
53 |                "Only fp16 and bf16 are supported");
54 |     AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
55 |                    (softmax_results.scalar_type() == at::ScalarType::BFloat16),
56 |                "Only fp16 and bf16 are supported");
57 | 
58 |     return bwd_cuda(output_grads, softmax_results, scale_factor);
59 | }
60 | 
61 | int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads)
62 | {
63 |     return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
64 | }
65 | 
66 | }  // end namespace scaled_masked_softmax
67 | }  // end namespace fused_softmax
68 | }  // end namespace multihead_attn
69 | 
70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
71 | {
72 |     m.def("forward",
73 |           &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
74 |           "Self Multihead Attention scaled, time masked softmax -- Forward.");
75 | 
76 |     m.def("backward",
77 |           &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
78 |           "Self Multihead Attention scaled, time masked softmax -- Backward.");
79 | 
80 |     m.def("get_batch_per_block",
81 |           &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
82 |           "Return Batch per block size.");
83 | }
84 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <ATen/cuda/CUDAContext.h>
19 | #include <cuda.h>
20 | #include <cuda_fp16.h>
21 | #include <cuda_profiler_api.h>
22 | #include <cuda_runtime.h>
23 | #include <torch/extension.h>
24 | #include "scaled_upper_triang_masked_softmax.h"
25 | #include "type_shim.h"
26 | 
27 | namespace multihead_attn {
28 | namespace fused_softmax {
29 | namespace scaled_upper_triang_masked_softmax {
30 | 
31 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor)
32 | {
33 |     // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
34 |     const int attn_batches = input.size(0);
35 |     const int seq_len = input.size(1);
36 |     TORCH_INTERNAL_ASSERT(seq_len <= 2048);
37 | 
38 |     // Output
39 |     auto act_options = input.options().requires_grad(false);
40 |     torch::Tensor softmax_results = torch::empty({attn_batches, seq_len, seq_len}, act_options);
41 | 
42 |     // Softmax Intermediate Result Ptr
43 |     void* input_ptr = static_cast<void*>(input.data_ptr());
44 |     void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
45 | 
46 |     DISPATCH_HALF_AND_BFLOAT(
47 |         input.scalar_type(),
48 |         "dispatch_scaled_upper_triang_masked_softmax_forward",
49 |         dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
50 |             reinterpret_cast<scalar_t*>(softmax_results_ptr),
51 |             reinterpret_cast<const scalar_t*>(input_ptr),
52 |             scale_factor,
53 |             seq_len,
54 |             seq_len,
55 |             attn_batches););
56 |     return softmax_results;
57 | }
58 | 
59 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads_,
60 |                        torch::Tensor const& softmax_results_,
61 |                        float scale_factor)
62 | {
63 |     auto output_grads = output_grads_.contiguous();
64 |     auto softmax_results = softmax_results_.contiguous();
65 | 
66 |     // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
67 |     const int attn_batches = output_grads.size(0);
68 |     const int seq_len = output_grads.size(1);
69 |     TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
70 | 
71 |     void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
72 | 
73 |     // Softmax Grad
74 |     DISPATCH_HALF_AND_BFLOAT(
75 |         output_grads_.scalar_type(),
76 |         "dispatch_scaled_upper_triang_masked_softmax_backward",
77 |         dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
78 |             reinterpret_cast<scalar_t*>(output_grads_ptr),
79 |             reinterpret_cast<scalar_t*>(output_grads_ptr),
80 |             reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
81 |             scale_factor,
82 |             seq_len,
83 |             seq_len,
84 |             attn_batches););
85 | 
86 |     // backward pass is completely in-place
87 |     return output_grads;
88 | }
89 | }  // namespace scaled_upper_triang_masked_softmax
90 | }  // namespace fused_softmax
91 | }  // namespace multihead_attn
92 | 


--------------------------------------------------------------------------------
/tests/model/test_model_generation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | instantiate models, save checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs
  3 | 
  4 | This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs
  5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures.
  6 | """
  7 | 
  8 | 
  9 | import os
 10 | 
 11 | if __name__ == "__main__":
 12 |     import sys
 13 | 
 14 |     sys.path.append(os.path.abspath(""))
 15 | 
 16 | import pytest
 17 | from tests.common import distributed_test, model_setup, parametrize, dict_repr
 18 | import torch
 19 | 
 20 | PARAMS_TO_TEST = {
 21 |     "pipe_parallel_size,model_parallel_size,world_size": [
 22 |         [0, 1, 1],
 23 |         [0, 1, 2],
 24 |         [1, 2, 2],
 25 |         [0, 2, 2],
 26 |         [2, 1, 2],
 27 |     ],
 28 |     "top_p,temperature,top_k": [[0.0, 0.5, 0], [0.5, 0.0, 100], [0.5, 0.5, 0]],
 29 |     "prompt": ["", "hello world"],
 30 |     "fp16,fp32_allreduce": [
 31 |         [
 32 |             {
 33 |                 "enabled": True,
 34 |                 "type": "bfloat16",
 35 |                 "loss_scale": 0,
 36 |                 "loss_scale_window": 1000,
 37 |                 "hysteresis": 2,
 38 |                 "min_loss_scale": 1,
 39 |             },
 40 |             True,
 41 |         ],
 42 |         [
 43 |             {
 44 |                 "enabled": True,
 45 |                 "loss_scale": 0,
 46 |                 "loss_scale_window": 1000,
 47 |                 "hysteresis": 2,
 48 |                 "min_loss_scale": 1,
 49 |             },
 50 |             False,
 51 |         ],
 52 |     ],
 53 | }
 54 | 
 55 | parameters, names = parametrize(
 56 |     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 57 | )
 58 | 
 59 | 
 60 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 61 | def test_train(param_dict):
 62 |     @distributed_test(world_size=param_dict.pop("world_size", 2))
 63 |     def wrapper():
 64 |         run_generate_test(param_dict=param_dict, prompt=param_dict.pop("prompt"))
 65 | 
 66 |     wrapper()
 67 | 
 68 | 
 69 | def run_generate_test(param_dict, prompt):
 70 |     from megatron.text_generation_utils import generate_samples_from_prompt
 71 |     from megatron.utils import is_mp_rank_0
 72 | 
 73 |     fixed_params = {
 74 |         "num_samples": 3,
 75 |         "maximum_tokens": 50,
 76 |         "make_vocab_size_divisible_by": 2,
 77 |         "sample_output_file": "test_sample_output.txt",
 78 |         "checkpoint_activations": False,
 79 |         "partition_activations": False,
 80 |         "no_load_optim": True,
 81 |     }
 82 | 
 83 |     param_dict.update(fixed_params)
 84 |     # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this
 85 |     model, _, _, args_loaded = model_setup(
 86 |         None, param_dict, clear_data=True, inference=True
 87 |     )
 88 |     model.eval()
 89 | 
 90 |     prompts = [prompt for _ in range(args_loaded.num_samples)]
 91 |     output = generate_samples_from_prompt(
 92 |         neox_args=args_loaded,
 93 |         model=model,
 94 |         text=prompts,
 95 |         maximum_tokens=args_loaded.maximum_tokens,
 96 |         recompute=False,
 97 |         temperature=args_loaded.temperature,
 98 |         top_k=args_loaded.top_k,
 99 |         top_p=args_loaded.top_p,
100 |     )
101 | 
102 |     # outputs only get generated on mp rank 0
103 |     if is_mp_rank_0():
104 |         assert len(output) == len(prompts)
105 |         for prompt, out in zip(prompts, output):
106 |             assert prompt == out["context"]
107 |             assert len(out["text"]) > 0
108 | 


--------------------------------------------------------------------------------
/megatron/tokenizer/train_tokenizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Assumes a dataset of jsonl files in the same format as the neox training set.
  3 | """
  4 | 
  5 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
  6 | from tokenizers.normalizers import NFKC
  7 | 
  8 | from glob import glob
  9 | import os
 10 | import json
 11 | import argparse
 12 | 
 13 | 
 14 | def load_jsonl(input_path, quiet=True) -> list:
 15 |     """
 16 |     Read list of objects from a JSON lines file.
 17 |     """
 18 |     data = []
 19 |     with open(input_path, "r", encoding="utf-8") as f:
 20 |         for line in f:
 21 |             data.append(json.loads(line.rstrip("\n|\r")))
 22 |     if not quiet:
 23 |         print("Loaded {} records from {}".format(len(data), input_path))
 24 |     return data
 25 | 
 26 | 
 27 | def json_iterator(input_dir, text_key="text"):
 28 |     all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
 29 |     for j in all_jsonls:
 30 |         data = load_jsonl(j)
 31 |         for doc in data:
 32 |             yield doc[text_key]
 33 | 
 34 | 
 35 | def train_tokenizer(
 36 |     input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
 37 | ):
 38 |     """
 39 |     Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`
 40 | 
 41 |     :param input_dir: input directory containing jsonl files
 42 |     :param save_path: path to save tokenizer to
 43 |     :param tokenizer_type: type of tokenizer to train.
 44 |     :param vocab_size: int, size of tokenizer's vocab
 45 |     :return:
 46 |     """
 47 | 
 48 |     if tokenizer_type == "BPE":
 49 |         model = models.BPE()
 50 |     else:
 51 |         raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
 52 |     tokenizer = Tokenizer(model)
 53 | 
 54 |     # Customize pre-tokenization and decoding
 55 |     tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 56 |     tokenizer.decoder = decoders.ByteLevel()
 57 |     tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
 58 |     tokenizer.normalizer = NFKC()
 59 | 
 60 |     # And then train
 61 |     trainer = trainers.BpeTrainer(
 62 |         vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
 63 |     )
 64 |     tokenizer.train_from_iterator(json_iterator(input_dir), trainer)
 65 | 
 66 |     # And Save it
 67 |     tokenizer.save(save_path, pretty=True)
 68 |     print(f"Tokenizer saved at {save_path}")
 69 | 
 70 | 
 71 | def parse_args():
 72 |     parser = argparse.ArgumentParser(
 73 |         description="script for training a multilingual "
 74 |         "HF tokenizer on CC dumps with upweighting for low resource languages"
 75 |     )
 76 |     parser.add_argument(
 77 |         "--json_input_dir",
 78 |         type=str,
 79 |         help="Path to folder containing tokenizer training data in jsonl format",
 80 |     )
 81 |     parser.add_argument(
 82 |         "--tokenizer_output_path",
 83 |         type=str,
 84 |         help="Path to which your trained tokenizer will be saved (should end in .json)",
 85 |     )
 86 |     parser.add_argument(
 87 |         "--tokenizer_type",
 88 |         type=str,
 89 |         help="type of tokenizer to train, currently only BPE is supported",
 90 |         choices=["BPE"],
 91 |         default=["BPE"],
 92 |     )
 93 |     parser.add_argument(
 94 |         "-v",
 95 |         "--vocab_size",
 96 |         help="vocabulary size of tokenizer, default=52k",
 97 |         type=int,
 98 |         default=52000,
 99 |     )
100 |     return parser.parse_args()
101 | 
102 | 
103 | if __name__ == "__main__":
104 | 
105 |     args = parse_args()
106 | 
107 |     train_tokenizer(
108 |         args.json_input_dir,
109 |         save_path=args.tokenizer_output_path,
110 |         tokenizer_type=args.tokenizer_type,
111 |         vocab_size=args.vocab_size,
112 |     )
113 | 


--------------------------------------------------------------------------------
/tests/model/test_model_checkpoint.py:
--------------------------------------------------------------------------------
  1 | """
  2 | instantiate models, save checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs
  3 | 
  4 | This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs
  5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures.
  6 | """
  7 | import os
  8 | 
  9 | if __name__ == "__main__":
 10 |     import sys
 11 | 
 12 |     sys.path.append(os.path.abspath(""))
 13 | 
 14 | import pytest
 15 | from tests.common import (
 16 |     distributed_test,
 17 |     clear_test_dirs,
 18 |     model_setup,
 19 |     binary,
 20 |     parametrize,
 21 | )
 22 | import torch
 23 | 
 24 | PARAMS_TO_TEST = {
 25 |     "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2], [2, 1]],
 26 |     "checkpoint_validation_with_forward_pass": [True],
 27 |     "fp16,fp32_allreduce": [
 28 |         [
 29 |             {
 30 |                 "enabled": True,
 31 |                 "type": "bfloat16",
 32 |                 "loss_scale": 0,
 33 |                 "loss_scale_window": 1000,
 34 |                 "hysteresis": 2,
 35 |                 "min_loss_scale": 1,
 36 |             },
 37 |             True,
 38 |         ],
 39 |         [
 40 |             {
 41 |                 "enabled": True,
 42 |                 "loss_scale": 0,
 43 |                 "loss_scale_window": 1000,
 44 |                 "hysteresis": 2,
 45 |                 "min_loss_scale": 1,
 46 |             },
 47 |             False,
 48 |         ],
 49 |     ],
 50 | }
 51 | 
 52 | parameters, names = parametrize(
 53 |     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 54 | )
 55 | 
 56 | 
 57 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 58 | def test_train(param_dict):
 59 |     @distributed_test(world_size=2)
 60 |     def wrapper():
 61 |         run_checkpoint_test(param_dict=param_dict)
 62 | 
 63 |     wrapper()
 64 | 
 65 | 
 66 | def run_checkpoint_test(yaml_list=None, param_dict=None):
 67 | 
 68 |     from megatron.checkpointing import load_checkpoint
 69 |     from megatron.checkpointing import save_checkpoint
 70 | 
 71 |     model, optimizer, lr_scheduler, args_loaded = model_setup(
 72 |         yaml_list, param_dict, clear_data=True
 73 |     )
 74 | 
 75 |     # save model checkpoint
 76 |     save_checkpoint(
 77 |         neox_args=args_loaded,
 78 |         iteration=42,
 79 |         model=model,
 80 |         optimizer=optimizer,
 81 |         lr_scheduler=lr_scheduler,
 82 |     )
 83 | 
 84 |     # reload model from checkpoint
 85 |     (
 86 |         reloaded_model,
 87 |         reloaded_optimizer,
 88 |         reloaded_lr_scheduler,
 89 |         args_reloaded,
 90 |     ) = model_setup(yaml_list, param_dict, clear_data=False)
 91 |     iteration = load_checkpoint(
 92 |         neox_args=args_reloaded,
 93 |         model=reloaded_model,
 94 |         optimizer=reloaded_optimizer,
 95 |         lr_scheduler=reloaded_lr_scheduler,
 96 |     )
 97 | 
 98 |     # ensure same checkpoint is loaded
 99 |     assert (
100 |         iteration == 42
101 |     ), "run_checkpoint_test() iteration loaded from checkpoint correct"
102 | 
103 |     # check all weight groups are the same
104 |     for idx, ((n1, p1), (n2, p2)) in enumerate(
105 |         zip(
106 |             list(model.module.named_parameters()),
107 |             list(reloaded_model.module.named_parameters()),
108 |         )
109 |     ):
110 |         assert n1 == n2
111 |         params_equal = (p1 == p2).all().item()
112 |         assert params_equal, "run_checkpoint_test() params equal: " + str(n1)
113 | 
114 |     if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
115 |         clear_test_dirs()
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     params = list(
120 |         parametrize(
121 |             PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
122 |         )
123 |     )
124 |     test_train(params[0])
125 | 


--------------------------------------------------------------------------------
/megatron/mpu/data.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import torch
 16 | 
 17 | from .initialize import get_model_parallel_group
 18 | from .initialize import get_model_parallel_rank
 19 | from .initialize import get_model_parallel_src_rank
 20 | 
 21 | 
 22 | _MAX_DATA_DIM = 4
 23 | 
 24 | 
 25 | def _check_data_types(keys, data, target_dtype):
 26 |     """Check that all the keys have the same target data type."""
 27 |     for key in keys:
 28 |         assert (
 29 |             data[key].dtype == target_dtype
 30 |         ), "{} has data type {} which " "is different than {}".format(
 31 |             key, data[key].dtype, target_dtype
 32 |         )
 33 | 
 34 | 
 35 | def _build_key_size_numel_dictionaries(keys, data):
 36 |     """Build the size on rank 0 and broadcast."""
 37 |     max_dim = _MAX_DATA_DIM
 38 |     sizes = [0 for _ in range(max_dim) for _ in keys]
 39 | 
 40 |     # Pack the sizes on rank zero.
 41 |     if get_model_parallel_rank() == 0:
 42 |         offset = 0
 43 |         for key in keys:
 44 |             assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM"
 45 |             size = data[key].size()
 46 |             for i, s in enumerate(size):
 47 |                 sizes[i + offset] = s
 48 |             offset += max_dim
 49 | 
 50 |     # Move to GPU and broadcast.
 51 |     sizes_cuda = torch.cuda.LongTensor(sizes)
 52 |     torch.distributed.broadcast(
 53 |         sizes_cuda, get_model_parallel_src_rank(), group=get_model_parallel_group()
 54 |     )
 55 | 
 56 |     # Move back to cpu and unpack.
 57 |     sizes_cpu = sizes_cuda.cpu()
 58 |     key_size = {}
 59 |     key_numel = {}
 60 |     total_numel = 0
 61 |     offset = 0
 62 |     for key in keys:
 63 |         i = 0
 64 |         size = []
 65 |         numel = 1
 66 |         while sizes_cpu[offset + i] > 0:
 67 |             this_size = sizes_cpu[offset + i]
 68 |             size.append(this_size)
 69 |             numel *= this_size
 70 |             i += 1
 71 |         key_size[key] = size
 72 |         key_numel[key] = numel
 73 |         total_numel += numel
 74 |         offset += max_dim
 75 | 
 76 |     return key_size, key_numel, total_numel
 77 | 
 78 | 
 79 | def broadcast_data(keys, data, datatype):
 80 |     """Broadcast data from rank zero of each model parallel group to the
 81 |     members of the same model parallel group.
 82 | 
 83 |     Arguments:
 84 |         keys: list of keys in the data disctionary to be broadcasted
 85 |         data: data dictionary of string keys and cpu tensor values.
 86 |         datatype: torch data type of all tensors in data associated
 87 |                   with keys.
 88 |     """
 89 |     # Build (key, size) and (key, number of elements) dictionaries along
 90 |     # with the total number of elements on all ranks.
 91 |     key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)
 92 | 
 93 |     # Pack on rank zero.
 94 |     if get_model_parallel_rank() == 0:
 95 |         # Check that all keys have the same data type.
 96 |         _check_data_types(keys, data, datatype)
 97 |         # Flatten the data associated with the keys
 98 |         flatten_data = torch.cat(
 99 |             [data[key].contiguous().view(-1) for key in keys], dim=0
100 |         ).cuda()
101 |     else:
102 |         flatten_data = torch.empty(
103 |             total_numel, device=torch.cuda.current_device(), dtype=datatype
104 |         )
105 | 
106 |     # Boradcast
107 |     torch.distributed.broadcast(
108 |         flatten_data, get_model_parallel_src_rank(), group=get_model_parallel_group()
109 |     )
110 | 
111 |     # Unpack
112 |     output = {}
113 |     offset = 0
114 |     for key in keys:
115 |         size = key_size[key]
116 |         numel = key_numel[key]
117 |         output[key] = flatten_data.narrow(0, offset, numel).view(size)
118 |         offset += numel
119 | 
120 |     return output
121 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:11.1.1-devel-ubuntu20.04
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | #### System package (uses default Python 3 version in Ubuntu 20.04)
 6 | RUN apt-get update -y && \
 7 |     apt-get install -y \
 8 |         git python3 python3-dev libpython3-dev python3-pip sudo pdsh \
 9 |         htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \
10 |         nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \
11 |         rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \
12 |         rdmacm-utils perftest rdma-core nano && \
13 |     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
14 |     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
15 |     pip install --upgrade pip && \
16 |     pip install gpustat
17 | 
18 | ### SSH
19 | # Set password
20 | RUN echo 'password' >> password.txt && \
21 |     mkdir /var/run/sshd && \
22 |     echo "root:`cat password.txt`" | chpasswd && \
23 |     # Allow root login with password
24 |     sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
25 |     # Prevent user being kicked off after login
26 |     sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
27 |     echo 'AuthorizedKeysFile     .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
28 |     echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \
29 |     # FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42
30 |     echo "Set disable_coredump false" >> /etc/sudo.conf && \
31 |     # Clean up
32 |     rm password.txt
33 | 
34 | # Expose SSH port
35 | EXPOSE 22
36 | 
37 | #### OPENMPI
38 | ENV OPENMPI_BASEVERSION=4.1
39 | ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0
40 | RUN mkdir -p /build && \
41 |     cd /build && \
42 |     wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
43 |     cd openmpi-${OPENMPI_VERSION} && \
44 |     ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
45 |     make -j"$(nproc)" install && \
46 |     ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
47 |     # Sanity check:
48 |     test -f /usr/local/mpi/bin/mpic++ && \
49 |     cd ~ && \
50 |     rm -rf /build
51 | 
52 | # Needs to be in docker PATH if compiling other items & bashrc PATH (later)
53 | ENV PATH=/usr/local/mpi/bin:${PATH} \
54 |     LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
55 | 
56 | # Create a wrapper for OpenMPI to allow running as root by default
57 | RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
58 |     echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
59 |     echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
60 |     chmod a+x /usr/local/mpi/bin/mpirun
61 | 
62 | #### User account
63 | RUN useradd --create-home --uid 1000 --shell /bin/bash mchorse && \
64 |     usermod -aG sudo mchorse && \
65 |     echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
66 | 
67 | ## SSH config and bashrc
68 | RUN mkdir -p /home/mchorse/.ssh /job && \
69 |     echo 'Host *' > /home/mchorse/.ssh/config && \
70 |     echo '    StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \
71 |     echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
72 |     echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
73 |     echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
74 |     echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
75 | 
76 | #### Python packages
77 | RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
78 | COPY requirements/requirements.txt .
79 | COPY requirements/requirements-onebitadam.txt .
80 | COPY requirements/requirements-sparseattention.txt .
81 | RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && pip install -r requirements-sparseattention.txt && pip cache purge
82 | 
83 | ## Install APEX
84 | RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
85 | 
86 | # Clear staging
87 | RUN mkdir -p /tmp && chmod 0777 /tmp
88 | 
89 | #### SWITCH TO mchorse USER
90 | USER mchorse
91 | WORKDIR /home/mchorse
92 | 


--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_commandline.py:
--------------------------------------------------------------------------------
  1 | """
  2 | verify parsing and handover of command line arguments
  3 | """
  4 | import pytest
  5 | import sys
  6 | from unittest.mock import patch
  7 | 
  8 | from ..common import get_root_directory, get_config_directory, get_configs_with_path
  9 | 
 10 | 
 11 | @pytest.mark.cpu
 12 | def test_neoxargs_consume_deepy_args_with_config_dir():
 13 |     """
 14 |     verify consume_deepy_args processes command line arguments without config dir
 15 |     """
 16 | 
 17 |     from megatron.neox_arguments import NeoXArgs
 18 | 
 19 |     # load neox args with command line
 20 |     with patch(
 21 |         "sys.argv",
 22 |         [str(get_root_directory() / "deepy.py"), "pretrain_gpt2.py"]
 23 |         + get_configs_with_path(["small.yml", "local_setup.yml"]),
 24 |     ):
 25 |         args_loaded_consume = NeoXArgs.consume_deepy_args()
 26 | 
 27 |     # load neox args directly from yaml files
 28 |     args_loaded_yamls = NeoXArgs.from_ymls(
 29 |         get_configs_with_path(["small.yml", "local_setup.yml"])
 30 |     )
 31 | 
 32 |     # update values from yaml files that cannot otherwise be matched
 33 |     args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py")
 34 |     args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group
 35 | 
 36 |     assert args_loaded_yamls == args_loaded_consume
 37 | 
 38 | 
 39 | @pytest.mark.cpu
 40 | def test_neoxargs_consume_deepy_args_without_yml_suffix():
 41 |     """
 42 |     verify consume_deepy_args processes command line arguments without yaml suffix
 43 |     """
 44 | 
 45 |     from megatron.neox_arguments import NeoXArgs
 46 | 
 47 |     # load neox args with command line
 48 |     with patch(
 49 |         "sys.argv",
 50 |         [str(get_root_directory() / "deepy.py"), "pretrain_gpt2.py"]
 51 |         + get_configs_with_path(["small", "local_setup"]),
 52 |     ):
 53 |         args_loaded_consume = NeoXArgs.consume_deepy_args()
 54 | 
 55 |     # load neox args directly from yaml files
 56 |     args_loaded_yamls = NeoXArgs.from_ymls(
 57 |         get_configs_with_path(["small.yml", "local_setup.yml"])
 58 |     )
 59 | 
 60 |     # update values from yaml files that cannot otherwise be matched
 61 |     args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py")
 62 |     args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group
 63 | 
 64 |     assert args_loaded_yamls == args_loaded_consume
 65 | 
 66 | 
 67 | @pytest.mark.cpu
 68 | def test_neoxargs_consume_deepy_args_with_config_dir():
 69 |     """
 70 |     verify consume_deepy_args processes command line arguments including config dir
 71 |     """
 72 | 
 73 |     from megatron.neox_arguments import NeoXArgs
 74 | 
 75 |     # load neox args with command line
 76 |     with patch(
 77 |         "sys.argv",
 78 |         [
 79 |             str(get_root_directory() / "deepy.py"),
 80 |             "pretrain_gpt2.py",
 81 |             "-d",
 82 |             str(get_config_directory()),
 83 |         ]
 84 |         + ["small.yml", "local_setup.yml"],
 85 |     ):
 86 |         args_loaded_consume = NeoXArgs.consume_deepy_args()
 87 | 
 88 |     # load neox args directly from yaml files
 89 |     args_loaded_yamls = NeoXArgs.from_ymls(
 90 |         get_configs_with_path(["small.yml", "local_setup.yml"])
 91 |     )
 92 | 
 93 |     # update values from yaml files that cannot otherwise be matched
 94 |     args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py")
 95 |     args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group
 96 | 
 97 |     assert args_loaded_yamls == args_loaded_consume
 98 | 
 99 | 
100 | @pytest.mark.cpu
101 | def test_neoxargs_consume_neox_args():
102 |     """
103 |     verify megatron args are correctly consumed after sending via deepspeed
104 |     """
105 |     from megatron.neox_arguments import NeoXArgs
106 | 
107 |     # intitially load config from files as would be the case in deepy.py
108 |     yaml_list = get_configs_with_path(["small.yml", "local_setup.yml"])
109 |     args_baseline = NeoXArgs.from_ymls(yaml_list)
110 |     args_baseline.update_value(
111 |         "user_script", str(get_root_directory() / "pretrain_gpt2.py")
112 |     )
113 |     deepspeed_main_args = args_baseline.get_deepspeed_main_args()
114 | 
115 |     # patch sys.argv so that args can be access by set_global_variables within initialize_megatron
116 |     with patch("sys.argv", deepspeed_main_args):
117 |         args_loaded = NeoXArgs.consume_neox_args()
118 | 
119 |     # TODO is the wandb group really to be changed?
120 |     args_loaded.wandb_group = args_baseline.wandb_group
121 |     assert args_baseline.megatron_config == args_loaded.megatron_config
122 | 


--------------------------------------------------------------------------------
/megatron/model/init_functions.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch
  4 | 
  5 | 
  6 | def init_method_normal(sigma):
  7 |     """Init method based on N(0, sigma)."""
  8 | 
  9 |     def init_(tensor):
 10 |         return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
 11 | 
 12 |     return init_
 13 | 
 14 | 
 15 | def scaled_init_method_normal(sigma, num_layers):
 16 |     """Init method based on N(0, sigma/sqrt(2*num_layers)."""
 17 |     std = sigma / math.sqrt(2.0 * num_layers)
 18 | 
 19 |     def init_(tensor):
 20 |         return torch.nn.init.normal_(tensor, mean=0.0, std=std)
 21 | 
 22 |     return init_
 23 | 
 24 | 
 25 | # orthogonal init does not support fp16, so have to patch it
 26 | def _orthogonal(tensor, gain=1):
 27 |     if tensor.ndimension() < 2:
 28 |         raise ValueError("Only tensors with 2 or more dimensions are supported")
 29 | 
 30 |     rows = tensor.size(0)
 31 |     cols = tensor.numel() // rows
 32 |     flattened = tensor.new(rows, cols).normal_(0, 1)
 33 | 
 34 |     if rows < cols:
 35 |         flattened.t_()
 36 | 
 37 |     # Compute the qr factorization
 38 |     dt = flattened.dtype
 39 |     flattened = flattened.to(torch.float32)  # orthogonal init does not support fp16
 40 |     q, r = torch.qr(flattened)
 41 |     q, r = q.to(dtype=dt), r.to(dtype=dt)
 42 |     # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
 43 |     d = torch.diag(r, 0)
 44 |     ph = d.sign()
 45 |     q *= ph
 46 | 
 47 |     if rows < cols:
 48 |         q.t_()
 49 | 
 50 |     with torch.no_grad():
 51 |         tensor.view_as(q).copy_(q)
 52 |         tensor.mul_(gain)
 53 |     return tensor
 54 | 
 55 | 
 56 | def orthogonal_init_method(n_layers=1):
 57 |     """Fills the input Tensor with a (semi) orthogonal matrix, as described in
 58 |     Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013)
 59 |     Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)"""
 60 | 
 61 |     def init_(tensor):
 62 |         return _orthogonal(tensor, math.sqrt(2 / n_layers))
 63 | 
 64 |     return init_
 65 | 
 66 | 
 67 | def xavier_uniform_init_method():
 68 |     """Fills the input Tensor with values according to the method described in Understanding the difficulty of
 69 |     training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution."""
 70 | 
 71 |     def init_(tensor):
 72 |         return torch.nn.init.xavier_uniform_(tensor)
 73 | 
 74 |     return init_
 75 | 
 76 | 
 77 | def xavier_normal_init_method():
 78 |     """Fills the input Tensor with values according to the method described in Understanding the difficulty of
 79 |     training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution."""
 80 | 
 81 |     def init_(tensor):
 82 |         return torch.nn.init.xavier_normal_(tensor)
 83 | 
 84 |     return init_
 85 | 
 86 | 
 87 | def small_init_init_method(dim):
 88 |     """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving
 89 |     the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution."""
 90 |     std = math.sqrt(2 / (5 * dim))
 91 | 
 92 |     def init_(tensor):
 93 |         return torch.nn.init.normal_(tensor, mean=0.0, std=std)
 94 | 
 95 |     return init_
 96 | 
 97 | 
 98 | def wang_init_method(n_layers, dim):
 99 |     std = 2 / n_layers / math.sqrt(dim)
100 | 
101 |     def init_(tensor):
102 |         return torch.nn.init.normal_(tensor, mean=0.0, std=std)
103 | 
104 |     return init_
105 | 
106 | 
107 | def get_init_methods(args):
108 |     def _get(name):
109 |         if name == "normal":
110 |             return init_method_normal(args.init_method_std)
111 |         elif name == "scaled_normal":
112 |             return scaled_init_method_normal(args.init_method_std, args.num_layers)
113 |         elif name == "orthogonal":
114 |             return orthogonal_init_method()
115 |         elif name == "scaled_orthogonal":
116 |             return orthogonal_init_method(args.num_layers)
117 |         elif name == "xavier_uniform":
118 |             return xavier_uniform_init_method()
119 |         elif name == "xavier_normal":
120 |             return xavier_normal_init_method()
121 |         elif name == "wang_init":
122 |             return wang_init_method(args.num_layers, args.hidden_size)
123 |         elif name == "small_init":
124 |             return small_init_init_method(args.hidden_size)
125 |         else:
126 |             raise NotImplementedError(f"Unkown init method {name}")
127 | 
128 |     return _get(args.init_method), _get(args.output_layer_init_method)
129 | 


--------------------------------------------------------------------------------
/megatron/model/activations.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import torch
 16 | import torch.nn.functional as F
 17 | 
 18 | torch._C._jit_set_profiling_mode(False)
 19 | torch._C._jit_set_profiling_executor(False)
 20 | torch._C._jit_override_can_fuse_on_cpu(True)
 21 | torch._C._jit_override_can_fuse_on_gpu(True)
 22 | 
 23 | 
 24 | def get_activation(neox_args):
 25 |     """retrieves the activation function specified in neox_args"""
 26 |     if neox_args.activation == "geglu":
 27 |         activation_func = GEGLU(neox_args=neox_args)
 28 |     elif neox_args.activation == "gelu":
 29 |         if neox_args.onnx_safe and neox_args.bias_gelu_fusion:
 30 |             raise ValueError("onnx_safe + bias_gelu_fusion not compatible")
 31 |         if neox_args.onnx_safe:
 32 |             activation_func = erf_gelu
 33 |         elif neox_args.bias_gelu_fusion:
 34 |             activation_func = bias_gelu_impl
 35 |         else:
 36 |             activation_func = F.gelu
 37 |     elif neox_args.activation == "relu":
 38 |         activation_func = F.relu
 39 |     elif neox_args.activation == "softsign":
 40 |         activation_func = F.softsign
 41 |     elif neox_args.activation == "swish":
 42 |         activation_func = swish
 43 |     elif neox_args.activation == "mish":
 44 |         activation_func = mish
 45 |     else:
 46 |         raise ValueError(f"Activation function {neox_args.activation} not recognized")
 47 |     return activation_func
 48 | 
 49 | 
 50 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 51 | # 1/sqrt(2*pi)-> 0.3989423
 52 | # 1/sqrt(2)   -> 0.70710678
 53 | # sqrt(2/pi)  -> 0.79788456
 54 | # this function is tanh approximation of gelu
 55 | # actual gelu is:
 56 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
 57 | 
 58 | 
 59 | @torch.jit.script
 60 | def bias_gelu(bias, y):
 61 |     x = bias + y
 62 |     return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
 63 | 
 64 | 
 65 | # gradient of tanh approximation of gelu
 66 | # gradient of actual gelu is:
 67 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
 68 | @torch.jit.script
 69 | def bias_gelu_back(g, bias, y):
 70 |     x = bias + y
 71 |     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
 72 |     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
 73 |     ff = 0.5 * x * (
 74 |         (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
 75 |     ) + 0.5 * (1 + tanh_out)
 76 |     return ff * g
 77 | 
 78 | 
 79 | class GeLUFunction(torch.autograd.Function):
 80 |     @staticmethod
 81 |     # bias is an optional argument
 82 |     def forward(ctx, input, bias):
 83 |         ctx.save_for_backward(input, bias)
 84 |         return bias_gelu(bias, input)
 85 | 
 86 |     @staticmethod
 87 |     def backward(ctx, grad_output):
 88 |         input, bias = ctx.saved_tensors
 89 |         tmp = bias_gelu_back(grad_output, bias, input)
 90 |         return tmp, tmp
 91 | 
 92 | 
 93 | bias_gelu_impl = GeLUFunction.apply
 94 | 
 95 | 
 96 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
 97 | @torch.jit.script
 98 | def erf_gelu(x):
 99 |     return (
100 |         x
101 |         * 0.5
102 |         * (
103 |             torch.erf(x / 1.41421).to(dtype=x.dtype)
104 |             + torch.ones_like(x).to(dtype=x.dtype)
105 |         )
106 |     )
107 | 
108 | 
109 | @torch.jit.script
110 | def swish(x, beta: float = 1.0):
111 |     return x * torch.sigmoid(beta * x)
112 | 
113 | 
114 | @torch.jit.script
115 | def mish(x):
116 |     return x * torch.tanh(F.softplus(x))
117 | 
118 | 
119 | class GEGLU(torch.nn.Module):
120 |     def __init__(self, neox_args):
121 |         super(GEGLU, self).__init__()
122 |         if neox_args.onnx_safe:
123 |             self.activation_func = erf_gelu
124 |         else:
125 |             self.activation_func = F.gelu
126 | 
127 |     def forward(self, x, bias=None):
128 |         x, gate = x.chunk(2, dim=-1)
129 |         if bias is not None:
130 |             bias_1, bias_2 = bias.chunk(2, dim=-1)
131 |             x = x + bias_1
132 |             gate = gate + bias_2
133 |         intermediate_parallel = self.activation_func(gate)
134 |         return intermediate_parallel * x
135 | 


--------------------------------------------------------------------------------
/megatron/model/gmlp.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | from megatron.model.fused_softmax import FusedScaleMaskSoftmax
  6 | from megatron.model.activations import get_activation
  7 | from megatron.model.norms import get_norm
  8 | from megatron.model.utils import get_fusion_type
  9 | 
 10 | from megatron import mpu
 11 | 
 12 | 
 13 | class TinyAttention(nn.Module):
 14 |     def __init__(self, neox_args, d_attn, d_ff, mask_fn):
 15 |         super().__init__()
 16 |         self.proj_qkv = nn.Linear(d_ff * 2, 3 * d_attn)
 17 |         self.scale = d_attn ** -0.5
 18 |         self.proj_ffn = nn.Linear(d_attn, d_ff)
 19 |         self.softmax = FusedScaleMaskSoftmax(
 20 |             input_in_fp16=neox_args.precision == "fp16",
 21 |             input_in_bf16=neox_args.precision == "bfloat16",
 22 |             fusion_type=get_fusion_type(neox_args),
 23 |             mask_func=mask_fn,
 24 |             softmax_in_fp32=neox_args.attention_softmax_in_fp32,
 25 |             scale=None,
 26 |         )
 27 | 
 28 |     def forward(self, x, attention_mask):
 29 |         q, k, v = torch.chunk(self.proj_qkv(x), 3, dim=-1)
 30 |         w = torch.einsum("bnd,bmd->bnm", q, k).unsqueeze(1) * self.scale
 31 |         a = self.softmax(
 32 |             w, mask=attention_mask[..., : w.size(-2), : w.size(-1)]
 33 |         ).squeeze(1)
 34 |         x = torch.einsum("bnm,bmd->bnd", a, v)
 35 |         return self.proj_ffn(x)
 36 | 
 37 | 
 38 | class SpatialGatingUnit(nn.Module):
 39 |     def __init__(self, neox_args, d_ff, d_attn=None, causal=True, mask_fn=None):
 40 |         super().__init__()
 41 |         self.causal = causal
 42 |         self.use_attn = d_attn is not None
 43 | 
 44 |         norm, eps = get_norm(neox_args)
 45 |         self.norm = norm(d_ff, eps=eps)
 46 |         self.proj = nn.Linear(neox_args.seq_length, neox_args.seq_length)
 47 |         if self.use_attn:
 48 |             assert mask_fn is not None
 49 |             self.attn = TinyAttention(
 50 |                 neox_args=neox_args, d_attn=d_attn, d_ff=d_ff, mask_fn=mask_fn
 51 |             )
 52 |         nn.init.zeros_(self.proj.weight)
 53 |         nn.init.constant_(self.proj.bias, 1.0)
 54 | 
 55 |     def forward(self, x, attention_mask):
 56 |         device, n = x.device, x.shape[1]
 57 |         x = x.transpose(0, 1)  # [s, b, d] -> [b, s, d]
 58 | 
 59 |         res, gate = x.chunk(2, dim=-1)  # split along dim
 60 |         gate = self.norm(gate)
 61 | 
 62 |         weight, bias = self.proj.weight, self.proj.bias
 63 |         if self.causal:
 64 |             weight, bias = weight[:n, :n], bias[:n]
 65 |             mask = torch.ones(weight.shape[:2], device=device).triu_(1).bool()
 66 |             weight = weight.masked_fill(mask, 0.0)
 67 | 
 68 |         gate = F.linear(gate.transpose(2, 1), weight, self.proj.bias).transpose(2, 1)
 69 | 
 70 |         if self.use_attn:
 71 |             gate = gate + self.attn(x, attention_mask)
 72 | 
 73 |         return (gate * res).transpose(0, 1)  # [b, s, d] -> [s, b, d]
 74 | 
 75 | 
 76 | class GMLPBlock(nn.Module):
 77 |     def __init__(
 78 |         self,
 79 |         neox_args,
 80 |         init_method,
 81 |         output_layer_init_method,
 82 |         layer_number,
 83 |         ff_mult=4,
 84 |         mask_fn=None,
 85 |     ):
 86 |         super().__init__()
 87 |         self.layer_number = layer_number
 88 | 
 89 |         ff_dim = neox_args.hidden_size * ff_mult
 90 |         norm, eps = get_norm(neox_args)
 91 |         self.norm = norm(neox_args.hidden_size, eps=eps)
 92 |         self.input_linear = mpu.ColumnParallelLinear(
 93 |             neox_args=neox_args,
 94 |             input_size=neox_args.hidden_size,
 95 |             output_size=ff_dim * 2,
 96 |             gather_output=False,
 97 |             init_method=init_method,
 98 |             skip_bias_add=True,
 99 |         )
100 |         self.activation_func = get_activation(neox_args)
101 |         ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size())
102 |         if neox_args.attention_config[layer_number] == "amlp":
103 |             d_attn = neox_args.gmlp_attn_dim
104 |         else:
105 |             d_attn = None
106 |         self.sgu = SpatialGatingUnit(
107 |             neox_args, ff_dim_parallel, d_attn, causal=True, mask_fn=mask_fn
108 |         )
109 |         self.output_linear = mpu.RowParallelLinear(
110 |             neox_args=neox_args,
111 |             input_size=ff_dim,
112 |             output_size=neox_args.hidden_size,
113 |             input_is_parallel=True,
114 |             init_method=output_layer_init_method,
115 |             skip_bias_add=True,
116 |         )
117 | 
118 |     def forward(self, args):
119 |         assert len(args) == 2, "GMLPBlock expects 2 arguments"
120 |         x, attention_mask = args
121 |         x = self.norm(x)
122 |         x, _ = self.input_linear(x)
123 |         x = self.activation_func(x)
124 |         x = self.sgu(x, attention_mask)
125 |         x, _ = self.output_linear(x)
126 |         return x, attention_mask
127 | 


--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_load.py:
--------------------------------------------------------------------------------
  1 | """
  2 | load all confings in neox/configs in order to perform validations implemented in NeoXArgs
  3 | """
  4 | import pytest
  5 | import yaml
  6 | from ..common import get_configs_with_path
  7 | 
  8 | 
  9 | def run_neox_args_load_test(yaml_files):
 10 |     from megatron.neox_arguments import NeoXArgs
 11 | 
 12 |     yaml_list = get_configs_with_path(yaml_files)
 13 |     args_loaded = NeoXArgs.from_ymls(yaml_list)
 14 |     assert isinstance(args_loaded, NeoXArgs)
 15 | 
 16 |     # initialize an empty config dictionary to be filled by yamls
 17 |     config = dict()
 18 | 
 19 |     # iterate of all to be loaded yaml files
 20 |     for conf_file_name in yaml_list:
 21 | 
 22 |         # load file
 23 |         with open(conf_file_name) as conf_file:
 24 |             conf = yaml.load(conf_file, Loader=yaml.FullLoader)
 25 | 
 26 |         # check for key duplicates and load values
 27 |         for conf_key, conf_value in conf.items():
 28 |             if conf_key in config:
 29 |                 raise ValueError(
 30 |                     f"Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}"
 31 |                 )
 32 | 
 33 |             conf_key_converted = conf_key.replace(
 34 |                 "-", "_"
 35 |             )  # TODO remove replace and update configuration files?
 36 |             config[conf_key_converted] = conf_value
 37 | 
 38 |     # validate that neox args has the same value as specified in the config (if specified in the config)
 39 |     for k, v in config.items():
 40 |         neox_args_value = getattr(args_loaded, k)
 41 |         assert v == neox_args_value, (
 42 |             "loaded neox args value "
 43 |             + str(k)
 44 |             + " == "
 45 |             + str(neox_args_value)
 46 |             + " different from config file "
 47 |             + str(v)
 48 |         )
 49 | 
 50 | 
 51 | @pytest.mark.cpu
 52 | def test_neoxargs_load_arguments_small_local_setup():
 53 |     """
 54 |     verify small.yml can be loaded without raising validation errors
 55 |     """
 56 |     run_neox_args_load_test(["small.yml", "local_setup.yml"])
 57 | 
 58 | 
 59 | @pytest.mark.cpu
 60 | def test_neoxargs_load_arguments_small_local_setup_text_generation():
 61 |     """
 62 |     verify small.yml can be loaded together with text generation without raising validation errors
 63 |     """
 64 |     run_neox_args_load_test(["small.yml", "local_setup.yml", "text_generation.yml"])
 65 | 
 66 | 
 67 | @pytest.mark.cpu
 68 | def test_neoxargs_load_arguments_medium_local_setup():
 69 |     """
 70 |     verify medium.yml can be loaded without raising validation errors
 71 |     """
 72 |     run_neox_args_load_test(["medium.yml", "local_setup.yml"])
 73 | 
 74 | 
 75 | @pytest.mark.cpu
 76 | def test_neoxargs_load_arguments_large_local_setup():
 77 |     """
 78 |     verify large.yml can be loaded without raising validation errors
 79 |     """
 80 |     run_neox_args_load_test(["large.yml", "local_setup.yml"])
 81 | 
 82 | 
 83 | @pytest.mark.cpu
 84 | def test_neoxargs_load_arguments_2_7B_local_setup():
 85 |     """
 86 |     verify 2-7B.yml can be loaded without raising validation errors
 87 |     """
 88 |     run_neox_args_load_test(["2-7B.yml", "local_setup.yml"])
 89 | 
 90 | 
 91 | @pytest.mark.cpu
 92 | def test_neoxargs_load_arguments_6_7B_local_setup():
 93 |     """
 94 |     verify 6-7B.yml can be loaded without raising validation errors
 95 |     """
 96 |     run_neox_args_load_test(["6-7B.yml", "local_setup.yml"])
 97 | 
 98 | 
 99 | @pytest.mark.cpu
100 | def test_neoxargs_load_arguments_13B_local_setup():
101 |     """
102 |     verify 13B.yml can be loaded without raising validation errors
103 |     """
104 |     run_neox_args_load_test(["13B.yml", "local_setup.yml"])
105 | 
106 | 
107 | @pytest.mark.cpu
108 | def test_neoxargs_load_arguments_XL_local_setup():
109 |     """
110 |     verify XL.yml can be loaded without raising validation errors
111 |     """
112 |     run_neox_args_load_test(["XL.yml", "local_setup.yml"])
113 | 
114 | 
115 | @pytest.mark.cpu
116 | def test_neoxargs_load_arguments_175B_local_setup():
117 |     """
118 |     verify 13B.yml can be loaded without raising validation errors
119 |     """
120 |     run_neox_args_load_test(["175B.yml", "local_setup.yml"])
121 | 
122 | 
123 | @pytest.mark.cpu
124 | def test_neoxargs_fail_instantiate_without_required_params():
125 |     """
126 |     verify assertion error if required arguments are not provided
127 |     """
128 | 
129 |     try:
130 |         run_neox_args_load_test(["local_setup.yml"])
131 |         assert False
132 |     except Exception as e:
133 |         assert True
134 | 
135 | 
136 | @pytest.mark.cpu
137 | def test_neoxargs_fail_instantiate_without_any_params():
138 |     """
139 |     verify assertion error if required arguments are not provided
140 |     """
141 |     from megatron.neox_arguments import NeoXArgs
142 | 
143 |     try:
144 |         args_loaded = NeoXArgs()
145 |         assert False
146 |     except Exception as e:
147 |         assert True
148 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
  1 | /* coding=utf-8
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #include <ATen/ATen.h>
 18 | #include <ATen/cuda/CUDAContext.h>
 19 | #include <cuda.h>
 20 | #include <cuda_fp16.h>
 21 | #include <cuda_profiler_api.h>
 22 | #include <cuda_runtime.h>
 23 | #include <torch/extension.h>
 24 | #include "scaled_masked_softmax.h"
 25 | #include "type_shim.h"
 26 | 
 27 | namespace multihead_attn {
 28 | namespace fused_softmax {
 29 | namespace scaled_masked_softmax {
 30 | 
 31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads)
 32 | {
 33 |     return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
 34 | }
 35 | 
 36 | torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor)
 37 | {
 38 |     // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 39 |     const int batches = input.size(0);
 40 |     const int pad_batches = mask.size(0);
 41 |     const int attn_heads = input.size(1);
 42 |     const int query_seq_len = input.size(2);
 43 |     const int key_seq_len = input.size(3);
 44 |     TORCH_INTERNAL_ASSERT(key_seq_len <= 2048);
 45 |     TORCH_INTERNAL_ASSERT(query_seq_len > 1);
 46 |     TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
 47 |     TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
 48 |     TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
 49 |     TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);
 50 | 
 51 |     // Output
 52 |     auto act_options = input.options().requires_grad(false);
 53 |     torch::Tensor softmax_results =
 54 |         torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
 55 | 
 56 |     // Softmax Intermediate Result Ptr
 57 |     void* input_ptr = static_cast<void*>(input.data_ptr());
 58 |     void* mask_ptr = static_cast<void*>(mask.data_ptr());
 59 |     void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
 60 | 
 61 |     DISPATCH_HALF_AND_BFLOAT(input.scalar_type(),
 62 |                              "dispatch_scaled_masked_softmax_forward",
 63 |                              dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>(
 64 |                                  reinterpret_cast<scalar_t*>(softmax_results_ptr),
 65 |                                  reinterpret_cast<const scalar_t*>(input_ptr),
 66 |                                  reinterpret_cast<const uint8_t*>(mask_ptr),
 67 |                                  scale_factor,
 68 |                                  query_seq_len,
 69 |                                  key_seq_len,
 70 |                                  batches,
 71 |                                  attn_heads,
 72 |                                  pad_batches););
 73 |     return softmax_results;
 74 | }
 75 | 
 76 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads_,
 77 |                        torch::Tensor const& softmax_results_,
 78 |                        float scale_factor)
 79 | {
 80 |     auto output_grads = output_grads_.contiguous();
 81 |     auto softmax_results = softmax_results_.contiguous();
 82 | 
 83 |     // output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 84 |     const int batches = output_grads.size(0);
 85 |     const int attn_heads = output_grads.size(1);
 86 |     const int query_seq_len = output_grads.size(2);
 87 |     const int key_seq_len = output_grads.size(3);
 88 | 
 89 |     void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
 90 | 
 91 |     // Softmax Grad
 92 |     DISPATCH_HALF_AND_BFLOAT(output_grads_.scalar_type(),
 93 |                              "dispatch_scaled_masked_softmax_backward",
 94 |                              dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
 95 |                                  reinterpret_cast<scalar_t*>(output_grads_ptr),
 96 |                                  reinterpret_cast<scalar_t*>(output_grads_ptr),
 97 |                                  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
 98 |                                  scale_factor,
 99 |                                  query_seq_len,
100 |                                  key_seq_len,
101 |                                  batches,
102 |                                  attn_heads););
103 | 
104 |     // backward pass is completely in-place
105 |     return output_grads;
106 | }
107 | }  // namespace scaled_masked_softmax
108 | }  // namespace fused_softmax
109 | }  // namespace multihead_attn
110 | 


--------------------------------------------------------------------------------
/megatron/mpu/cross_entropy.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | import torch
 17 | 
 18 | from .initialize import get_model_parallel_group
 19 | from .initialize import get_model_parallel_rank
 20 | from .initialize import get_model_parallel_world_size
 21 | from .utils import VocabUtility
 22 | 
 23 | 
 24 | class _VocabParallelCrossEntropy(torch.autograd.Function):
 25 |     @staticmethod
 26 |     def forward(ctx, vocab_parallel_logits, target):
 27 | 
 28 |         # Maximum value along vocab dimension across all GPUs.
 29 |         logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
 30 |         torch.distributed.all_reduce(
 31 |             logits_max,
 32 |             op=torch.distributed.ReduceOp.MAX,
 33 |             group=get_model_parallel_group(),
 34 |         )
 35 |         # Subtract the maximum value.
 36 |         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
 37 | 
 38 |         # Get the partition's vocab indecies
 39 |         get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
 40 |         partition_vocab_size = vocab_parallel_logits.size()[-1]
 41 |         rank = get_model_parallel_rank()
 42 |         world_size = get_model_parallel_world_size()
 43 |         vocab_start_index, vocab_end_index = get_vocab_range(
 44 |             partition_vocab_size, rank, world_size
 45 |         )
 46 | 
 47 |         # Create a mask of valid vocab ids (1 means it needs to be masked).
 48 |         target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
 49 |         masked_target = target.clone() - vocab_start_index
 50 |         masked_target[target_mask] = 0
 51 | 
 52 |         # Get predicted-logits = logits[target].
 53 |         # For Simplicity, we convert logits to a 2-D tensor with size
 54 |         # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
 55 |         logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
 56 |         masked_target_1d = masked_target.view(-1)
 57 |         arange_1d = torch.arange(
 58 |             start=0, end=logits_2d.size()[0], device=logits_2d.device
 59 |         )
 60 |         predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
 61 |         predicted_logits_1d = predicted_logits_1d.clone().contiguous()
 62 |         predicted_logits = predicted_logits_1d.view_as(target)
 63 |         predicted_logits[target_mask] = 0.0
 64 |         # All reduce is needed to get the chunks from other GPUs.
 65 |         torch.distributed.all_reduce(
 66 |             predicted_logits,
 67 |             op=torch.distributed.ReduceOp.SUM,
 68 |             group=get_model_parallel_group(),
 69 |         )
 70 | 
 71 |         # Sum of exponential of logits along vocab dimension across all GPUs.
 72 |         exp_logits = vocab_parallel_logits
 73 |         torch.exp(vocab_parallel_logits, out=exp_logits)
 74 |         sum_exp_logits = exp_logits.sum(dim=-1)
 75 |         torch.distributed.all_reduce(
 76 |             sum_exp_logits,
 77 |             op=torch.distributed.ReduceOp.SUM,
 78 |             group=get_model_parallel_group(),
 79 |         )
 80 | 
 81 |         # Loss = log(sum(exp(logits))) - predicted-logit.
 82 |         loss = torch.log(sum_exp_logits) - predicted_logits
 83 | 
 84 |         # Store softmax, target-mask and masked-target for backward pass.
 85 |         exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
 86 |         ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
 87 | 
 88 |         return loss
 89 | 
 90 |     @staticmethod
 91 |     def backward(ctx, grad_output):
 92 | 
 93 |         # Retreive tensors from the forward path.
 94 |         softmax, target_mask, masked_target_1d = ctx.saved_tensors
 95 | 
 96 |         # All the inputs have softmax as thier gradient.
 97 |         grad_input = softmax
 98 |         # For simplicity, work with the 2D gradient.
 99 |         partition_vocab_size = softmax.size()[-1]
100 |         grad_2d = grad_input.view(-1, partition_vocab_size)
101 | 
102 |         # Add the gradient from matching classes.
103 |         arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
104 |         grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float()
105 | 
106 |         # Finally elementwise multiplication with the output gradients.
107 |         grad_input.mul_(grad_output.unsqueeze(dim=-1))
108 | 
109 |         return grad_input, None
110 | 
111 | 
112 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
113 |     """Helper function for the cross entropy."""
114 |     return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
115 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | # Refer to the following link for the explanation of each params:
  3 | #   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
  4 | Language: Cpp
  5 | # BasedOnStyle: Google
  6 | AccessModifierOffset: -4
  7 | AlignAfterOpenBracket: Align
  8 | AlignConsecutiveAssignments: false
  9 | AlignConsecutiveDeclarations: false
 10 | AlignEscapedNewlines: Left
 11 | AlignOperands: true
 12 | AlignTrailingComments: true
 13 | AllowAllParametersOfDeclarationOnNextLine: false
 14 | AllowShortBlocksOnASingleLine: true
 15 | AllowShortCaseLabelsOnASingleLine: true
 16 | AllowShortFunctionsOnASingleLine: All
 17 | AllowShortIfStatementsOnASingleLine: true
 18 | AllowShortLoopsOnASingleLine: true
 19 | # This is deprecated
 20 | AlwaysBreakAfterDefinitionReturnType: None
 21 | AlwaysBreakAfterReturnType: None
 22 | AlwaysBreakBeforeMultilineStrings: true
 23 | AlwaysBreakTemplateDeclarations: true
 24 | BinPackArguments:  false
 25 | BinPackParameters: false
 26 | BraceWrapping:
 27 |   AfterClass:            false
 28 |   AfterControlStatement: false
 29 |   AfterEnum:             false
 30 |   AfterFunction:         false
 31 |   AfterNamespace:        false
 32 |   AfterObjCDeclaration:  false
 33 |   AfterStruct:           false
 34 |   AfterUnion:            false
 35 |   AfterExternBlock:      false
 36 |   BeforeCatch:           false
 37 |   BeforeElse:            false
 38 |   IndentBraces:          false
 39 |   # disabling the below splits, else, they'll just add to the vertical length of source files!
 40 |   SplitEmptyFunction: false
 41 |   SplitEmptyRecord: false
 42 |   SplitEmptyNamespace: false
 43 | BreakBeforeBinaryOperators: None
 44 | BreakBeforeBraces: WebKit
 45 | BreakBeforeInheritanceComma: false
 46 | BreakInheritanceList: BeforeColon
 47 | BreakBeforeTernaryOperators: true
 48 | BreakConstructorInitializersBeforeComma: false
 49 | BreakConstructorInitializers: BeforeColon
 50 | BreakAfterJavaFieldAnnotations: false
 51 | BreakStringLiterals: true
 52 | ColumnLimit: 100
 53 | CommentPragmas: '^ IWYU pragma:'
 54 | CompactNamespaces: false
 55 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 56 | # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
 57 | ConstructorInitializerIndentWidth: 4
 58 | ContinuationIndentWidth: 4
 59 | Cpp11BracedListStyle: true
 60 | DerivePointerAlignment: false
 61 | DisableFormat: false
 62 | ExperimentalAutoDetectBinPacking: false
 63 | FixNamespaceComments: true
 64 | ForEachMacros:
 65 |   - foreach
 66 |   - Q_FOREACH
 67 |   - BOOST_FOREACH
 68 | IncludeBlocks: Preserve
 69 | IncludeCategories:
 70 |   - Regex:           '^<ext/.*\.h>'
 71 |     Priority:        2
 72 |   - Regex:           '^<.*\.h>'
 73 |     Priority:        1
 74 |   - Regex:           '^<.*'
 75 |     Priority:        2
 76 |   - Regex:           '.*'
 77 |     Priority:        3
 78 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 79 | IndentCaseLabels: true
 80 | IndentPPDirectives: None
 81 | IndentWidth:     4
 82 | IndentWrappedFunctionNames: false
 83 | JavaScriptQuotes: Leave
 84 | JavaScriptWrapImports: true
 85 | KeepEmptyLinesAtTheStartOfBlocks: false
 86 | MacroBlockBegin: ''
 87 | MacroBlockEnd:   ''
 88 | MaxEmptyLinesToKeep: 1
 89 | NamespaceIndentation: None
 90 | ObjCBinPackProtocolList: Never
 91 | ObjCBlockIndentWidth: 4
 92 | ObjCSpaceAfterProperty: false
 93 | ObjCSpaceBeforeProtocolList: true
 94 | PenaltyBreakAssignment: 4
 95 | PenaltyBreakBeforeFirstCallParameter: 1
 96 | PenaltyBreakComment: 300
 97 | PenaltyBreakFirstLessLess: 120
 98 | PenaltyBreakString: 1000
 99 | PenaltyBreakTemplateDeclaration: 10
100 | PenaltyExcessCharacter: 1000000
101 | PenaltyReturnTypeOnItsOwnLine: 200
102 | PointerAlignment: Left
103 | RawStringFormats:
104 |   - Language: Cpp
105 |     Delimiters:
106 |       - cc
107 |       - CC
108 |       - cpp
109 |       - Cpp
110 |       - CPP
111 |       - 'c++'
112 |       - 'C++'
113 |     CanonicalDelimiter: ''
114 |   - Language: TextProto
115 |     Delimiters:
116 |       - pb
117 |       - PB
118 |       - proto
119 |       - PROTO
120 |     EnclosingFunctions:
121 |       - EqualsProto
122 |       - EquivToProto
123 |       - PARSE_PARTIAL_TEXT_PROTO
124 |       - PARSE_TEST_PROTO
125 |       - PARSE_TEXT_PROTO
126 |       - ParseTextOrDie
127 |       - ParseTextProtoOrDie
128 |     CanonicalDelimiter: ''
129 |     BasedOnStyle: google
130 | # Enabling comment reflow causes doxygen comments to be messed up in their formats!
131 | ReflowComments: true
132 | SortIncludes: true
133 | SortUsingDeclarations: true
134 | SpaceAfterCStyleCast: false
135 | SpaceAfterTemplateKeyword: true
136 | SpaceBeforeAssignmentOperators: true
137 | SpaceBeforeCpp11BracedList: false
138 | SpaceBeforeCtorInitializerColon: true
139 | SpaceBeforeInheritanceColon: true
140 | SpaceBeforeParens: ControlStatements
141 | SpaceBeforeRangeBasedForLoopColon: true
142 | SpaceInEmptyParentheses: false
143 | SpacesBeforeTrailingComments: 2
144 | SpacesInAngles: false
145 | SpacesInContainerLiterals: true
146 | SpacesInCStyleCastParentheses: false
147 | SpacesInParentheses: false
148 | SpacesInSquareBrackets: false
149 | Standard: Cpp11
150 | StatementMacros:
151 |   - Q_UNUSED
152 |   - QT_REQUIRE_VERSION
153 | # Be consistent with indent-width, even for people who use tab for indentation!
154 | TabWidth: 4
155 | UseTab: Never
156 | 


--------------------------------------------------------------------------------
/megatron/learning_rates.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Learning rate decay functions."""
 16 | 
 17 | import math
 18 | 
 19 | from megatron import print_rank_0
 20 | 
 21 | 
 22 | class AnnealingLR(object):
 23 |     """Anneals the learning rate."""
 24 | 
 25 |     def __init__(
 26 |         self,
 27 |         optimizer,
 28 |         start_lr,
 29 |         warmup_iter,
 30 |         total_iters,
 31 |         decay_style,
 32 |         last_iter,
 33 |         min_lr=0.0,
 34 |         use_checkpoint_lr_scheduler=True,
 35 |         override_lr_scheduler=False,
 36 |     ):
 37 | 
 38 |         # Class values.
 39 |         self.optimizer = optimizer
 40 |         self.start_lr = start_lr
 41 |         self.min_lr = min_lr
 42 |         self.warmup_iter = warmup_iter
 43 |         self.num_iters = last_iter
 44 |         self.end_iter = total_iters
 45 |         assert self.end_iter > 0
 46 |         self.decay_style = decay_style
 47 |         self.override_lr_scheduler = override_lr_scheduler
 48 |         self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
 49 |         if self.override_lr_scheduler:
 50 |             assert not self.use_checkpoint_lr_scheduler, (
 51 |                 "both override and " "use-checkpoint are set."
 52 |             )
 53 |         # Set the learning rate
 54 |         self.step(self.num_iters)
 55 | 
 56 |         print_rank_0("> learning rate decay style: {}".format(self.decay_style))
 57 | 
 58 |     def get_lr(self):
 59 |         """Learning rate decay functions from:
 60 |         https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
 61 | 
 62 |         num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
 63 |         # Warmup.
 64 |         if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
 65 |             return float(self.start_lr) * num_iters_ / self.warmup_iter
 66 | 
 67 |         num_iters_ = num_iters_ - self.warmup_iter
 68 |         if self.decay_style == "linear":
 69 |             lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
 70 |         elif self.decay_style == "cosine":
 71 |             lr = (
 72 |                 self.start_lr
 73 |                 / 2.0
 74 |                 * (math.cos(math.pi * num_iters_ / self.end_iter) + 1)
 75 |             )
 76 |         elif self.decay_style == "exponential":
 77 |             # exp(-0.693) = 1/2
 78 |             lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
 79 |         else:
 80 |             lr = self.start_lr
 81 |         return max(lr, self.min_lr)
 82 | 
 83 |     def step(self, step_num=None):
 84 |         """Set lr for all parameters groups."""
 85 |         if step_num is None:
 86 |             step_num = self.num_iters + 1
 87 |         self.num_iters = step_num
 88 |         new_lr = self.get_lr()
 89 |         for group in self.optimizer.param_groups:
 90 |             group["lr"] = new_lr
 91 | 
 92 |     def state_dict(self):
 93 |         state_dict = {
 94 |             "start_lr": self.start_lr,
 95 |             "warmup_iter": self.warmup_iter,
 96 |             "num_iters": self.num_iters,
 97 |             "decay_style": self.decay_style,
 98 |             "end_iter": self.end_iter,
 99 |             "min_lr": self.min_lr,
100 |         }
101 |         return state_dict
102 | 
103 |     def _check_and_set(self, cls_value, sd_value, name):
104 |         """Auxiliary function for checking the values in the checkpoint and
105 |         setting them."""
106 |         if self.override_lr_scheduler:
107 |             print_rank_0(" > overriding {} value to {}".format(name, cls_value))
108 |             return cls_value
109 | 
110 |         if not self.use_checkpoint_lr_scheduler:
111 |             assert cls_value == sd_value, (
112 |                 "AnnealingLR: class input value"
113 |                 "and checkpoint values for {} do not match".format(name)
114 |             )
115 |         print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name))
116 |         return sd_value
117 | 
118 |     def load_state_dict(self, sd):
119 | 
120 |         self.start_lr = self._check_and_set(
121 |             self.start_lr, sd["start_lr"], "learning rate"
122 |         )
123 |         self.min_lr = self._check_and_set(
124 |             self.min_lr, sd["min_lr"], "minimum learning rate"
125 |         )
126 |         self.warmup_iter = self._check_and_set(
127 |             self.warmup_iter, sd["warmup_iter"], "warmup iterations"
128 |         )
129 |         self.end_iter = self._check_and_set(
130 |             self.end_iter, sd["end_iter"], "total number of iterations"
131 |         )
132 |         self.decay_style = self._check_and_set(
133 |             self.decay_style, sd["decay_style"], "decay style"
134 |         )
135 | 
136 |         self.num_iters = sd["num_iters"]
137 |         self.step(self.num_iters)
138 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include "compat.h"
19 | 
20 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)                                \
21 |     switch (TYPE) {                                                              \
22 |         case at::ScalarType::Half: {                                             \
23 |             using scalar_t = at::Half;                                           \
24 |             __VA_ARGS__;                                                         \
25 |             break;                                                               \
26 |         }                                                                        \
27 |         case at::ScalarType::BFloat16: {                                         \
28 |             using scalar_t = at::BFloat16;                                       \
29 |             __VA_ARGS__;                                                         \
30 |             break;                                                               \
31 |         }                                                                        \
32 |         default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
33 |     }
34 | 
35 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...)              \
36 |     switch (TYPEIN) {                                                                       \
37 |         case at::ScalarType::Float: {                                                       \
38 |             using scalar_t_in = float;                                                      \
39 |             switch (TYPEOUT) {                                                              \
40 |                 case at::ScalarType::Float: {                                               \
41 |                     using scalar_t_out = float;                                             \
42 |                     __VA_ARGS__;                                                            \
43 |                     break;                                                                  \
44 |                 }                                                                           \
45 |                 case at::ScalarType::Half: {                                                \
46 |                     using scalar_t_out = at::Half;                                          \
47 |                     __VA_ARGS__;                                                            \
48 |                     break;                                                                  \
49 |                 }                                                                           \
50 |                 case at::ScalarType::BFloat16: {                                            \
51 |                     using scalar_t_out = at::BFloat16;                                      \
52 |                     __VA_ARGS__;                                                            \
53 |                     break;                                                                  \
54 |                 }                                                                           \
55 |                 default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
56 |             }                                                                               \
57 |             break;                                                                          \
58 |         }                                                                                   \
59 |         case at::ScalarType::Half: {                                                        \
60 |             using scalar_t_in = at::Half;                                                   \
61 |             using scalar_t_out = at::Half;                                                  \
62 |             __VA_ARGS__;                                                                    \
63 |             break;                                                                          \
64 |         }                                                                                   \
65 |         case at::ScalarType::BFloat16: {                                                    \
66 |             using scalar_t_in = at::BFloat16;                                               \
67 |             using scalar_t_out = at::BFloat16;                                              \
68 |             __VA_ARGS__;                                                                    \
69 |             break;                                                                          \
70 |         }                                                                                   \
71 |         default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");          \
72 |     }
73 | 


--------------------------------------------------------------------------------
/megatron/mpu/mappings.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import torch
 16 | 
 17 | from .initialize import (
 18 |     get_model_parallel_group,
 19 |     get_model_parallel_world_size,
 20 |     get_model_parallel_rank,
 21 |     get_fp32_allreduce,
 22 | )
 23 | from .utils import split_tensor_along_last_dim
 24 | 
 25 | 
 26 | def _reduce(input_):
 27 |     """All-reduce the the input tensor across model parallel group."""
 28 | 
 29 |     # Bypass the function if we are using only 1 GPU.
 30 |     if get_model_parallel_world_size() == 1:
 31 |         return input_
 32 | 
 33 |     # Bf16 convert
 34 |     dt = input_.dtype
 35 |     if dt == torch.bfloat16 and get_fp32_allreduce():
 36 |         input_ = input_.float()
 37 | 
 38 |     # All-reduce.
 39 |     torch.distributed.all_reduce(input_, group=get_model_parallel_group())
 40 | 
 41 |     # Bf16 convert
 42 |     if dt == torch.bfloat16 and get_fp32_allreduce():
 43 |         input_ = input_.bfloat16()
 44 | 
 45 |     return input_
 46 | 
 47 | 
 48 | def _split(input_):
 49 |     """Split the tensor along its last dimension and keep the
 50 |     corresponding slice."""
 51 | 
 52 |     world_size = get_model_parallel_world_size()
 53 |     # Bypass the function if we are using only 1 GPU.
 54 |     if world_size == 1:
 55 |         return input_
 56 | 
 57 |     # Bf16 convert
 58 |     dt = input_.dtype
 59 |     if dt == torch.bfloat16 and get_fp32_allreduce():
 60 |         input_ = input_.float()
 61 | 
 62 |     # Split along last dimension.
 63 |     input_list = split_tensor_along_last_dim(input_, world_size)
 64 | 
 65 |     # Note: torch.split does not create contiguous tensors by default.
 66 |     rank = get_model_parallel_rank()
 67 |     output = input_list[rank].contiguous()
 68 | 
 69 |     # Bf16 convert
 70 |     if dt == torch.bfloat16 and get_fp32_allreduce():
 71 |         output = output.bfloat16()
 72 | 
 73 |     return output
 74 | 
 75 | 
 76 | def _gather(input_):
 77 |     """Gather tensors and concatinate along the last dimension."""
 78 | 
 79 |     world_size = get_model_parallel_world_size()
 80 |     # Bypass the function if we are using only 1 GPU.
 81 |     if world_size == 1:
 82 |         return input_
 83 | 
 84 |     # Bf16 convert
 85 |     dt = input_.dtype
 86 |     if dt == torch.bfloat16 and get_fp32_allreduce():
 87 |         input_ = input_.float()
 88 | 
 89 |     # Size and dimension.
 90 |     last_dim = input_.dim() - 1
 91 |     rank = get_model_parallel_rank()
 92 | 
 93 |     tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
 94 |     tensor_list[rank] = input_
 95 |     torch.distributed.all_gather(tensor_list, input_, group=get_model_parallel_group())
 96 | 
 97 |     # Note: torch.cat already creates a contiguous tensor.
 98 |     output = torch.cat(tensor_list, dim=last_dim).contiguous()
 99 | 
100 |     # Bf16 convert
101 |     if dt == torch.bfloat16 and get_fp32_allreduce():
102 |         output = output.bfloat16()
103 | 
104 |     return output
105 | 
106 | 
107 | class _CopyToModelParallelRegion(torch.autograd.Function):
108 |     """Pass the input to the model parallel region."""
109 | 
110 |     @staticmethod
111 |     def symbolic(graph, input_):
112 |         return input_
113 | 
114 |     @staticmethod
115 |     def forward(ctx, input_):
116 |         return input_
117 | 
118 |     @staticmethod
119 |     def backward(ctx, grad_output):
120 |         return _reduce(grad_output)
121 | 
122 | 
123 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
124 |     """All-reduce the input from the model parallel region."""
125 | 
126 |     @staticmethod
127 |     def symbolic(graph, input_):
128 |         return _reduce(input_)
129 | 
130 |     @staticmethod
131 |     def forward(ctx, input_):
132 |         return _reduce(input_)
133 | 
134 |     @staticmethod
135 |     def backward(ctx, grad_output):
136 |         return grad_output
137 | 
138 | 
139 | class _ScatterToModelParallelRegion(torch.autograd.Function):
140 |     """Split the input and keep only the corresponding chuck to the rank."""
141 | 
142 |     @staticmethod
143 |     def symbolic(graph, input_):
144 |         return _split(input_)
145 | 
146 |     @staticmethod
147 |     def forward(ctx, input_):
148 |         return _split(input_)
149 | 
150 |     @staticmethod
151 |     def backward(ctx, grad_output):
152 |         return _gather(grad_output)
153 | 
154 | 
155 | class _GatherFromModelParallelRegion(torch.autograd.Function):
156 |     """Gather the input from model parallel region and concatinate."""
157 | 
158 |     @staticmethod
159 |     def symbolic(graph, input_):
160 |         return _gather(input_)
161 | 
162 |     @staticmethod
163 |     def forward(ctx, input_):
164 |         return _gather(input_)
165 | 
166 |     @staticmethod
167 |     def backward(ctx, grad_output):
168 |         return _split(grad_output)
169 | 
170 | 
171 | # -----------------
172 | # Helper functions.
173 | # -----------------
174 | 
175 | 
176 | def copy_to_model_parallel_region(input_):
177 |     return _CopyToModelParallelRegion.apply(input_)
178 | 
179 | 
180 | def reduce_from_model_parallel_region(input_):
181 |     return _ReduceFromModelParallelRegion.apply(input_)
182 | 
183 | 
184 | def scatter_to_model_parallel_region(input_):
185 |     return _ScatterToModelParallelRegion.apply(input_)
186 | 
187 | 
188 | def gather_from_model_parallel_region(input_):
189 |     return _GatherFromModelParallelRegion.apply(input_)
190 | 


--------------------------------------------------------------------------------
/tests/model/test_model_train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | instantiate models, save checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs
  3 | 
  4 | This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs
  5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures.
  6 | """
  7 | import pytest
  8 | 
  9 | from ..common import distributed_test, clear_test_dirs, model_setup, binary, parametrize
 10 | 
 11 | import torch
 12 | import os
 13 | 
 14 | PARAMS_TO_TEST = {
 15 |     "norm,pos_emb,activation": [
 16 |         ["layernorm", "learned", "gelu"],
 17 |         ["rmsnorm", "rotary", "relu"],
 18 |         ["scalenorm", "sinusoidal", "mish"],
 19 |         ["layernorm", "rpe", "geglu"],
 20 |         ["rmsnorm", "none", "swish"],
 21 |     ],
 22 |     "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]],
 23 |     "no_weight_tying": binary,
 24 |     "attention_config,num_layers": [
 25 |         [[[["global"], "all"]], 2],
 26 |         [[[["local", "global"], "all"]], 12],
 27 |         [[[["sparse_variable", "global"], "all"]], 12],
 28 |         [[[["sparse_fixed", "global"], "all"]], 12],
 29 |     ],  # the sparse attention models need more layers to be stable
 30 |     "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [
 31 |         [True, False],
 32 |         [False, True],
 33 |     ],
 34 |     "checkpoint_activations": binary,
 35 |     "log_gradient_noise_scale": [True],
 36 |     "sparsity_config": [
 37 |         {
 38 |             "block": 16,  # block size
 39 |             "num_local_blocks": 32,
 40 |         }
 41 |     ],
 42 | }
 43 | 
 44 | 
 45 | parameters, names = parametrize(
 46 |     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 47 | )
 48 | 
 49 | 
 50 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 51 | def test_train(param_dict):
 52 |     @distributed_test(world_size=2)
 53 |     def wrapper():
 54 |         run_train_test(param_dict=param_dict)
 55 | 
 56 |     wrapper()
 57 | 
 58 | 
 59 | BF16_PARAMS_TO_TEST = {
 60 |     "fp16,fp32_allreduce": [
 61 |         [
 62 |             {
 63 |                 "enabled": True,
 64 |                 "type": "bfloat16",
 65 |                 "loss_scale": 0,
 66 |                 "loss_scale_window": 1000,
 67 |                 "hysteresis": 2,
 68 |                 "min_loss_scale": 1,
 69 |             },
 70 |             True,
 71 |         ],
 72 |         [
 73 |             {
 74 |                 "enabled": True,
 75 |                 "loss_scale": 0,
 76 |                 "loss_scale_window": 1000,
 77 |                 "hysteresis": 2,
 78 |                 "min_loss_scale": 1,
 79 |             },
 80 |             False,
 81 |         ],
 82 |     ]
 83 | }
 84 | 
 85 | parameters, names = parametrize(
 86 |     BF16_PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 87 | )
 88 | 
 89 | 
 90 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
 91 | def test_train_bf16(param_dict):
 92 |     @distributed_test(world_size=2)
 93 |     def wrapper():
 94 |         run_train_test(param_dict=param_dict)
 95 | 
 96 |     wrapper()
 97 | 
 98 | 
 99 | OPTIMIZER_PARAMS = {
100 |     "optimizer": [
101 |         {"type": "adam", "params": {"lr": 0.0006}},
102 |         {"type": "onebitadam", "params": {"lr": 0.0006}},
103 |         {"type": "cpu_adam", "params": {"lr": 0.0006}},
104 |         {"type": "cpu_torch_adam", "params": {"lr": 0.0006}},
105 |         {"type": "sm3", "params": {"lr": 0.0006}},
106 |         {"type": "madgrad_wd", "params": {"lr": 0.0006}},
107 |     ]
108 | }
109 | opt_params, opt_name = parametrize(
110 |     OPTIMIZER_PARAMS, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
111 | )
112 | 
113 | 
114 | @pytest.mark.parametrize("param_dict", parameters, ids=names)
115 | def test_train_optimizers(param_dict):
116 |     @distributed_test(world_size=2)
117 |     def wrapper():
118 |         run_train_test(param_dict=param_dict)
119 | 
120 |     wrapper()
121 | 
122 | 
123 | def run_train_test(yaml_list=None, param_dict=None):
124 |     from megatron.training import train_step
125 |     from megatron.utils import Timers
126 | 
127 |     max_steps = 64
128 | 
129 |     model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
130 | 
131 |     model.train()
132 | 
133 |     timers = Timers(use_wandb=False, tensorboard_writer=None)
134 | 
135 |     # generate some random data on which we can overfit
136 |     # context size of data is model seq_len + 1 in order to compute loss
137 |     data_list = list()
138 |     context_tokens_tensor = torch.randint(
139 |         0, args_loaded.padded_vocab_size, (4, args_loaded.seq_length + 1)
140 |     ).to(torch.int64)
141 |     for i in range(max_steps):
142 |         data_list.append({"text": context_tokens_tensor.clone()})
143 |     data_iterator = iter(data_list)
144 | 
145 |     # run train_step until the loss decreases
146 |     losses = list()
147 |     for i in range(max_steps):
148 |         loss_dict, skipped_iter = train_step(
149 |             neox_args=args_loaded,
150 |             timers=timers,
151 |             data_iterator=data_iterator,
152 |             model=model,
153 |             optimizer=optimizer,
154 |             lr_scheduler=lr_scheduler,
155 |         )
156 |         losses.append(loss_dict["lm_loss"])
157 |         if len(losses) >= 2:
158 |             if torch.isnan(losses[-1]):
159 |                 continue
160 |             if torch.isnan(losses[-2]):
161 |                 continue
162 |             if losses[-1] < losses[-2]:
163 |                 return  # all good
164 | 
165 |     # loss should have decreased by now (otherwise increasing the max_steps parameter could have the testcase pass)
166 |     assert losses[-1] < losses[-2], (
167 |         "run_train_test() loss going down within " + str(max_steps) + " steps"
168 |     )
169 | 
170 |     if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
171 |         clear_test_dirs()
172 | 


--------------------------------------------------------------------------------
/megatron/model/positional_embeddings.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import math
  3 | 
  4 | 
  5 | class SinusoidalPositionalEmbedding(torch.nn.Module):
  6 |     def __init__(self, dim, base=10000, precision=torch.half):
  7 |         super().__init__()
  8 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
  9 |         self.register_buffer("inv_freq", inv_freq)
 10 |         self.precision = precision
 11 | 
 12 |     def forward(self, x, seq_dim=1):
 13 |         t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
 14 |         sinusoid_inp = torch.einsum("i,j->ij", t, self.inv_freq)
 15 |         if self.precision == torch.bfloat16:
 16 |             sinusoid_inp = sinusoid_inp.float()
 17 |         sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
 18 |         if self.precision == torch.bfloat16:
 19 |             sin, cos = sin.bfloat16(), cos.bfloat16()
 20 |         emb = torch.cat((sin, cos), dim=-1)
 21 |         return emb[None, :, :]
 22 | 
 23 | 
 24 | class RotaryEmbedding(torch.nn.Module):
 25 |     def __init__(self, dim, base=10000, precision=torch.half):
 26 |         super().__init__()
 27 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
 28 |         self.register_buffer("inv_freq", inv_freq)
 29 |         self.seq_len_cached = None
 30 |         self.cos_cached = None
 31 |         self.sin_cached = None
 32 |         self.precision = precision
 33 | 
 34 |     def forward(self, x, seq_dim=1, seq_len=None):
 35 |         if seq_len is None:
 36 |             seq_len = x.shape[seq_dim]
 37 |         if seq_len != self.seq_len_cached:
 38 |             self.seq_len_cached = seq_len
 39 |             t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
 40 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq)
 41 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
 42 |             if self.precision == torch.bfloat16:
 43 |                 emb = emb.float()
 44 |             self.cos_cached = emb.cos()[:, None, None, :]
 45 |             self.sin_cached = emb.sin()[:, None, None, :]
 46 |             if self.precision == torch.bfloat16:
 47 |                 self.cos_cached = self.cos_cached.bfloat16()
 48 |                 self.sin_cached = self.sin_cached.bfloat16()
 49 |         return self.cos_cached, self.sin_cached
 50 | 
 51 | 
 52 | # rotary pos emb helpers:
 53 | 
 54 | 
 55 | def rotate_half(x):
 56 |     x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
 57 |     return torch.cat(
 58 |         (-x2, x1), dim=x1.ndim - 1
 59 |     )  # dim=-1 triggers a bug in earlier torch versions
 60 | 
 61 | 
 62 | @torch.jit.script
 63 | def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
 64 |     cos, sin = (
 65 |         cos[offset : q.shape[0] + offset, ...],
 66 |         sin[offset : q.shape[0] + offset, ...],
 67 |     )
 68 |     return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
 69 | 
 70 | 
 71 | def apply_rotary_pos_emb_torch(
 72 |     q, k, cos, sin, offset: int = 0
 73 | ):  # jitting fails with bf16
 74 |     cos, sin = (
 75 |         cos[offset : q.shape[0] + offset, ...],
 76 |         sin[offset : q.shape[0] + offset, ...],
 77 |     )
 78 |     return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
 79 | 
 80 | 
 81 | class AliBi(torch.nn.Module):
 82 |     def __init__(self, num_heads, mp_size=1, mp_rank=1):
 83 |         super().__init__()
 84 |         # megatron splits across heads, so we need to make sure each
 85 |         # head receives the correct matrix
 86 |         assert mp_size <= num_heads and mp_rank <= mp_size
 87 |         self.mp_size = mp_size
 88 |         self.mp_rank = mp_rank
 89 |         self.num_heads = num_heads
 90 |         self.slice_size = num_heads // mp_size
 91 |         self.cached_matrix = None
 92 |         self.cached_seq_len = None
 93 |         slopes = torch.Tensor(self._get_slopes(num_heads))[
 94 |             mp_rank * self.slice_size : (mp_rank + 1) * self.slice_size
 95 |         ]
 96 |         self.register_buffer("slopes", slopes)
 97 | 
 98 |     def _get_slopes(self, n):
 99 |         """
100 |         Get slopes for Alibi positional embedding
101 |         n : int = number of heads.
102 |         For best performance, restrict n to a power of 2.
103 |         """
104 | 
105 |         def get_slopes_power_of_2(n):
106 |             start = 2 ** (-(2 ** -(math.log2(n) - 3)))
107 |             ratio = start
108 |             return [start * ratio ** i for i in range(n)]
109 | 
110 |         if math.log2(n).is_integer():
111 |             return get_slopes_power_of_2(n)
112 |         else:
113 |             closest_power_of_2 = 2 ** math.floor(math.log2(n))
114 |             return (
115 |                 get_slopes_power_of_2(closest_power_of_2)
116 |                 + self._get_slopes(2 * closest_power_of_2)[0::2][
117 |                     : n - closest_power_of_2
118 |                 ]
119 |             )
120 | 
121 |     def forward(self, x):
122 |         # [b, np, sq, sk]
123 |         seq_len_q = x.shape[-2]
124 |         seq_len_k = x.shape[-1]
125 |         if self.cached_seq_len != seq_len_k:
126 |             a = -torch.tril(
127 |                 torch.arange(seq_len_k).view(seq_len_k, 1).repeat(1, seq_len_k)
128 |                 + torch.arange(0, -seq_len_k, -1)
129 |             )
130 |             a = a.to(x.device).to(x.dtype)
131 |             slopes = self.slopes.to(a.device).to(a.dtype)
132 |             a = a * slopes.view(self.slopes.shape[0], 1, 1)
133 |             self.cached_seq_len = seq_len_k
134 |             self.cached_matrix = a
135 |         else:
136 |             a = self.cached_matrix
137 | 
138 |         if seq_len_q != seq_len_k:
139 |             # In the train case x has dimensionality [b, np, sq, sk] with sq == sk
140 |             # The number of query tokens is equal to the number of key tokens
141 |             # At inference time with cache in layer_past sq is not equal to sk. sq only contains one token (the last one in the full sequence)
142 |             # In this case we use the appropriate token index of the cache matrix.
143 |             # As the cache matrix could already be bigger from a past inference, not the last token index in the sq sequence is used
144 |             assert (
145 |                 seq_len_q == 1
146 |             ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1"
147 |             a = a[:, seq_len_k - 1, :].view(
148 |                 a.shape[0], 1, a.shape[2]
149 |             )  # seq_len_k - 1 points to the last token index in the current inference batch.
150 | 
151 |         return x + a
152 | 


--------------------------------------------------------------------------------
/megatron/data/samplers.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Batch samplers that work with either random or sequential data samplers."""
 16 | 
 17 | import torch
 18 | from torch.utils import data
 19 | 
 20 | 
 21 | class RandomSampler(data.sampler.Sampler):
 22 |     """Based off of pytorch RandomSampler and DistributedSampler. Essentially
 23 |     a RandomSampler, but this class lets the user set an epoch like
 24 |     DistributedSampler Samples elements randomly. If without replacement, then
 25 |     sample from a shuffled dataset. If with replacement, then user can
 26 |     specify ``num_samples`` to draw.
 27 |     Arguments:
 28 |         data_source (Dataset): dataset to sample from
 29 |         num_samples (int): number of samples to draw, default=len(dataset)
 30 |         replacement (bool): samples are drawn with replacement if ``True``,
 31 |         default=False
 32 |     """
 33 | 
 34 |     def __init__(self, data_source, replacement=False, num_samples=None):
 35 |         self.data_source = data_source
 36 |         self.replacement = replacement
 37 |         self._num_samples = num_samples
 38 |         self.epoch = -1
 39 | 
 40 |         if self._num_samples is not None and replacement is False:
 41 |             raise ValueError(
 42 |                 "With replacement=False, num_samples should not "
 43 |                 "be specified, since a random permute will be "
 44 |                 "performed."
 45 |             )
 46 | 
 47 |         if not isinstance(self.num_samples, int) or self.num_samples <= 0:
 48 |             raise ValueError(
 49 |                 "num_samples should be a positive integer "
 50 |                 "value, but got num_samples={}".format(self.num_samples)
 51 |             )
 52 |         if not isinstance(self.replacement, bool):
 53 |             raise ValueError(
 54 |                 "replacement should be a boolean value, but got "
 55 |                 "replacement={}".format(self.replacement)
 56 |             )
 57 | 
 58 |     @property
 59 |     def num_samples(self):
 60 |         # dataset size might change at runtime
 61 |         if self._num_samples is None:
 62 |             return len(self.data_source)
 63 |         return self._num_samples
 64 | 
 65 |     def __iter__(self):
 66 |         n = len(self.data_source)
 67 |         g = torch.Generator()
 68 |         if self.epoch >= 0:
 69 |             g.manual_seed(self.epoch)
 70 |         if self.replacement:
 71 |             return iter(
 72 |                 torch.randint(
 73 |                     high=n, size=(self.num_samples,), dtype=torch.int64, generator=g
 74 |                 ).tolist()
 75 |             )
 76 |         return iter(torch.randperm(n, generator=g).tolist())
 77 | 
 78 |     def __len__(self):
 79 |         return self.num_samples
 80 | 
 81 |     def set_epoch(self, epoch):
 82 |         self.epoch = epoch
 83 | 
 84 | 
 85 | class DistributedBatchSampler(data.sampler.BatchSampler):
 86 |     """Similar to normal implementation of distributed sampler, except
 87 |     implementation is at the batch sampler level, instead of just the
 88 |     sampler level. This allows wrapping of arbitrary data samplers
 89 |     (sequential, random, WeightedRandomSampler, etc.) with this batch
 90 |     sampler.
 91 | 
 92 |     The `interleave` argument specifies how to distribute a batch. A value
 93 |     of True combined with the above random sampler is equivalent to pytorch's
 94 |     torch.utils.data.distributed.DistributedSampler.
 95 | 
 96 |     For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
 97 |     specifying True will result in the following samples for each gpu:
 98 |         GPU0: [0,2,4,6] GPU1: [1,3,5,7]
 99 |     specifying False will result in the following samples:
100 |         GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""
101 | 
102 |     def __init__(
103 |         self,
104 |         sampler,
105 |         batch_size,
106 |         drop_last,
107 |         rank=-1,
108 |         world_size=2,
109 |         wrap_last=False,
110 |         interleave=False,
111 |     ):
112 |         super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
113 |         if rank == -1:
114 |             assert False, "should not be here"
115 |             rank = torch.distributed.get_rank()
116 |         self.rank = rank
117 |         self.world_size = world_size
118 |         self.sampler.wrap_around = 0
119 |         self.wrap_around = 0
120 |         self.wrap_last = wrap_last
121 |         self.start_iter = 0
122 |         self.interleave = interleave
123 | 
124 |     def __iter__(self):
125 |         batch = []
126 |         i = 0
127 |         for idx in self.data_iterator(self.sampler, wrap_around=False):
128 |             batch.append(idx)
129 |             if len(batch) == self.batch_size:
130 |                 tbatch = self._batch(batch)
131 |                 if i >= self.start_iter:
132 |                     yield tbatch
133 |                     self.start_iter = 0
134 |                 i += 1
135 |                 batch = []
136 |         batch_len = len(batch)
137 |         if batch_len > 0 and not self.drop_last:
138 |             if self.wrap_last:
139 |                 self.sampler.wrap_around -= self.batch_size
140 |                 self.wrap_around += len(batch)
141 |                 self.wrap_around %= self.batch_size
142 |             yield self._batch(batch)
143 |         if self.wrap_last:
144 |             self.sampler.wrap_around += self.batch_size
145 | 
146 |     def data_iterator(self, _iter, wrap_around=False):
147 |         """iterates through data and handles wrap around"""
148 |         for i, idx in enumerate(_iter):
149 |             if i < self.wrap_around % self.batch_size:
150 |                 continue
151 |             if wrap_around:
152 |                 self.wrap_around += 1
153 |                 self.wrap_around %= self.batch_size
154 |             yield idx
155 | 
156 |     def _batch(self, batch):
157 |         """extracts samples only pertaining to this worker's batch"""
158 |         if self.interleave:
159 |             return batch[self.rank : self.batch_size : self.world_size]
160 |         start = self.rank * self.batch_size // self.world_size
161 |         end = (self.rank + 1) * self.batch_size // self.world_size
162 |         return batch[start:end]
163 | 


--------------------------------------------------------------------------------
/megatron/neox_arguments/deepspeed_args.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | 
  3 | try:
  4 |     from .template import NeoXArgsTemplate
  5 | except ImportError:
  6 |     from template import NeoXArgsTemplate
  7 | 
  8 | 
  9 | @dataclass
 10 | class NeoXArgsDeepspeedConfig(NeoXArgsTemplate):
 11 |     """
 12 |     Args for deepspeed config
 13 |     Every argument included here will be included in deepspeed config json
 14 |     #TODO this list is not complete as compared to https://www.deepspeed.ai/docs/config-json/
 15 |     """
 16 | 
 17 |     deepspeed: bool = True
 18 |     """boolean flag to enable DeepSpeed (Always True)"""
 19 | 
 20 |     train_batch_size: int = None
 21 |     """
 22 |     The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs.
 23 |     """
 24 | 
 25 |     train_micro_batch_size_per_gpu: int = None
 26 |     """
 27 |     Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON.
 28 |     """
 29 | 
 30 |     gradient_accumulation_steps: int = 1
 31 |     """
 32 |     Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON.
 33 |     """
 34 | 
 35 |     optimizer: dict = None
 36 |     """
 37 |     dict containing the keys type and params
 38 | 
 39 |     type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch.
 40 | 
 41 |     params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam).
 42 |     """
 43 | 
 44 |     scheduler: dict = None
 45 |     """
 46 |     dict containing the keys type and params
 47 | 
 48 |     type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers.
 49 | 
 50 |     params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature.
 51 |     """
 52 | 
 53 |     fp32_allreduce: bool = False
 54 |     """
 55 |     During gradient averaging perform allreduce with 32 bit values
 56 |     """
 57 | 
 58 |     prescale_gradients: bool = False
 59 |     """
 60 |     Scale gradients before doing allreduce
 61 |     """
 62 | 
 63 |     gradient_predivide_factor: float = 1.0
 64 |     """
 65 |     Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs
 66 |     """
 67 | 
 68 |     sparse_gradients: bool = False
 69 |     """
 70 |     Enable sparse compression of torch.nn.Embedding gradients.
 71 |     """
 72 | 
 73 |     fp16: dict = None
 74 |     """
 75 |     Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package.
 76 |     """
 77 | 
 78 |     amp: dict = None
 79 |     """
 80 |     Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options
 81 |     """
 82 | 
 83 |     gradient_clipping: float = 0.0
 84 |     """
 85 |     Enable gradient clipping with provided value
 86 |     """
 87 | 
 88 |     zero_optimization: dict = None
 89 |     """"""
 90 | 
 91 |     steps_per_print: int = 10
 92 |     """
 93 |     Print train loss every N steps.
 94 |     """
 95 | 
 96 |     wall_clock_breakdown: bool = False
 97 |     """
 98 |     Enable timing of the latency of forward/backward/update training phases.
 99 |     """
100 | 
101 |     dump_state: bool = False
102 |     """
103 |     Print out state information of DeepSpeed object after initialization.
104 |     """
105 | 
106 |     flops_profiler: dict = None
107 |     """
108 |     Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler
109 |     """
110 | 
111 |     zero_allow_untested_optimizer: bool = False
112 |     """
113 |     Whether Deepspeed Zero Optimizer will allow an optimizer that hasn't been tested by the deepspeed team
114 |     """
115 | 
116 | 
117 | @dataclass
118 | class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
119 |     """
120 |     Args for deepspeed runner (deepspeed.launcher.runner).
121 |     Every argument included here will be passed as command line argument to deepspeed.launcher.runner
122 |     """
123 | 
124 |     hostfile: str = None
125 |     """
126 |     list of hostnames / ssh aliases and the number of GPUs per host
127 | 
128 |     example file contents:
129 |     worker-1 slots=4
130 |     worker-2 slots=4
131 |     127.0.0 slots=4
132 |     127.0.1 slots=4
133 |     """
134 | 
135 |     include: str = None
136 |     """
137 |     Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`.
138 |     """
139 | 
140 |     exclude: str = None
141 |     """
142 |     Specify hardware resources to NOT use during execution. Same format as include
143 |     """
144 | 
145 |     num_nodes: int = -1
146 |     """
147 |     Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all.
148 |     """
149 | 
150 |     num_gpus: int = None
151 |     """
152 |     Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all.
153 |     """
154 | 
155 |     master_port: int = 29500
156 |     """
157 |     Port used by PyTorch distributed for communication during training.
158 |     """
159 | 
160 |     master_addr: str = None
161 |     """
162 |     IP address of node 0, will be inferred via 'hostname -I' if not specified.
163 |     """
164 | 
165 |     launcher: str = "pdsh"
166 |     """
167 |     Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH.
168 |     """
169 | 
170 |     detect_nvlink_pairs: bool = False
171 |     """
172 |     If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2.
173 |     """
174 | 


--------------------------------------------------------------------------------