├── tests ├── __init__.py ├── pytest.ini ├── neox_args │ ├── __init__.py │ ├── test_neoxargs_implementation.py │ ├── test_neoxargs_usage.py │ ├── test_neoxargs_commandline.py │ └── test_neoxargs_load.py ├── model │ ├── __init__.py │ ├── test_model_instantiation.py │ ├── test_model_generation.py │ ├── test_model_checkpoint.py │ └── test_model_train.py ├── Readme.md └── test_configs │ └── test_train_base.yml ├── tools ├── kill.sh ├── killall.sh ├── sync_cmd.sh ├── sync.sh └── syncdir.sh ├── CODEOWNERS ├── megatron ├── data │ ├── __init__.py │ ├── Makefile │ ├── blendable_dataset.py │ └── samplers.py ├── gradient_noise_scale │ └── __init__.py ├── tokenizer │ ├── __init__.py │ └── train_tokenizer.py ├── model │ ├── __init__.py │ ├── fused_bias_dropout.py │ ├── norms.py │ ├── init_functions.py │ ├── activations.py │ ├── gmlp.py │ └── positional_embeddings.py ├── fused_kernels │ ├── compat.h │ ├── __init__.py │ ├── setup.py │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── scaled_masked_softmax_cuda.cu │ └── type_shim.h ├── mpu │ ├── random.py │ ├── __init__.py │ ├── utils.py │ ├── data.py │ ├── cross_entropy.py │ └── mappings.py ├── __init__.py ├── neox_arguments │ ├── template.py │ ├── __init__.py │ └── deepspeed_args.py └── learning_rates.py ├── requirements ├── requirements-sparseattention.txt ├── requirements-onebitadam.txt ├── requirements-tensorboard.txt ├── requirements-dev.txt └── requirements.txt ├── MANIFEST.in ├── eval_tasks └── __init__.py ├── test.py ├── myscripts └── tokenizer_downloads.sh ├── .github ├── workflows │ ├── pull_request.yml │ └── docker_build.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── create_tokenizer.py ├── configs ├── text_generation.yml ├── sparse.yml ├── eleutherai_cluster.yml ├── myconfigs │ ├── data_config.yml │ ├── local_setup.yml │ ├── small.yml │ ├── model_config.yml │ └── 20B.yml ├── local_setup.yml ├── gmlp_small.yml ├── small.yml ├── 13B.yml ├── XL.yml ├── 175B.yml ├── 2-7B.yml ├── 6-7B.yml ├── large.yml ├── medium.yml ├── bnb_small.yml ├── small_bf16.yml ├── 20B.yml └── gen_docs.py ├── .pre-commit-config.yaml ├── train.py ├── deepy.py ├── prepare_data.py ├── CITATION.cff ├── evaluate.py ├── .gitignore ├── generate.py ├── Dockerfile └── .clang-format /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @EleutherAI/pm-gptneo 2 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import * 2 | -------------------------------------------------------------------------------- /requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==0.4.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111==8.6.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.5.0 2 | -------------------------------------------------------------------------------- /tools/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 2 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | cpu: marks tests that can be run on cpu 4 | -------------------------------------------------------------------------------- /megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.5.6 2 | pytest==6.2.3 3 | pytest-cov==2.11.1 4 | pytest-forked==1.3.0 5 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | 3 | tokenizer = AutoTokenizer.from_pretrained('downloads/20B_tokenizer.json') -------------------------------------------------------------------------------- /tests/neox_args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_model_instantiation import run_test_model_instantiation 2 | from .test_model_train import run_train_test 3 | from .test_model_checkpoint import run_checkpoint_test 4 | -------------------------------------------------------------------------------- /tools/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Runs a command in parallel across all nodes 4 | # Usage 5 | # sync_cmd.sh 'echo "hello world"' 6 
| 7 | echo "Command: $1"; 8 | pdsh -R ssh -w ^/job/hosts $1 9 | -------------------------------------------------------------------------------- /myscripts/tokenizer_downloads.sh: -------------------------------------------------------------------------------- 1 | mkdir data 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 3 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 4 | mv gpt2-vocab.json data 5 | mv gpt2-merges.txt data -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | -------------------------------------------------------------------------------- /tools/sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Push files to all nodes 4 | # Usage 5 | # sync.sh file [file2..] 6 | 7 | echo Number of files to upload: $# 8 | 9 | for file in "$@" 10 | do 11 | full_path=$(realpath $file) 12 | echo Uploading $full_path 13 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 14 | done 15 | -------------------------------------------------------------------------------- /tools/syncdir.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Push directories to all nodes 4 | # Usage 5 | # syncdir.sh dir [dir2..] 
6 | 7 | echo Number of files to upload: $# 8 | 9 | for file in "$@" 10 | do 11 | full_path=$(realpath $file) 12 | parentdir="$(dirname "$full_path")" 13 | echo Uploading $full_path to $parentdir 14 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 15 | done 16 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | """ 2 | check implementation of NeoXArgs for duplication errors (would overwrite) 3 | """ 4 | import pytest 5 | 6 | 7 | @pytest.mark.cpu 8 | def test_neoxargs_duplicates(): 9 | """ 10 | tests that there are no duplicates among parent classes of NeoXArgs 11 | """ 12 | from megatron import NeoXArgs 13 | 14 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 15 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | git+git://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed 2 | einops==0.3.0 3 | ftfy==6.0.1 4 | lm_dataformat==0.0.19 5 | git+https://github.com/EleutherAI/lm-evaluation-harness.git@dc937d4b70af819c5695e09d94e59e4cdb1e40ad#egg=lm_eval 6 | mpi4py==3.0.3 7 | numpy==1.21.0 8 | pybind11==2.6.2 9 | regex 10 | sentencepiece 11 | six 12 | tokenizers==0.10.2 13 | transformers==4.5.0 14 | wandb==0.10.28 15 | -------------------------------------------------------------------------------- /create_tokenizer.py: -------------------------------------------------------------------------------- 1 | from tokenizers import Tokenizer 2 | from megatron.tokenizer.tokenizer import HFTokenizer 3 | 4 | 5 | filepath = 'data/tokenizer/20B_tokenizer.json' 6 | tokenizer = HFTokenizer(filepath) 7 | hello_ids = tokenizer.tokenize("hello") 8 | print(hello_ids) 9 | 10 | ids = tokenizer.tokenize('}{') 11 | print(ids) 12 | ids = tokenizer.tokenize('---') 13 | print(ids) 14 | start_ids = tokenizer.tokenize('<|startoftext|>') 15 | end_ids = tokenizer.tokenize('<|endoftext|>') 16 | print(start_ids) 17 | print(end_ids) -------------------------------------------------------------------------------- /configs/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text-gen-type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "temperature": 1.0, 10 | "top_p": .9, 11 | "top_k": 50, 12 | "recompute": false, 13 | 14 | # `unconditional`: samples 15 | "num-samples": 5, 16 | 17 | # input/output file 18 | "sample-input-file": "sample_input.txt", 19 | "sample-output-file": "sample_output.txt", 20 | } 21 | -------------------------------------------------------------------------------- /configs/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed 
config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Proposed solution** 24 | If you have an idea for how we can fix this problem, describe it here. 25 | 26 | **Screenshots** 27 | If applicable, add screenshots to help explain your problem. 28 | 29 | **Environment (please complete the following information):** 30 | - GPUs: 31 | - Configs: 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import get_params_for_weight_decay_optimization 20 | from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /configs/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | "data-path": "/mnt/ssd-1/data/enron/enron_text_document", 4 | # or for weighted datasets: 5 | # "train-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"], 6 | # "test-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"], 7 | # "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"], 8 | # "train-data-weights": [1., 2.], 9 | # "test-data-weights": [2., 1.], 10 | # "valid-data-weights": [0.5, 0.4], 11 | 12 | "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json", 13 | "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt", 14 | "save": "/mnt/ssd-1/checkpoints", 15 | "load": "/mnt/ssd-1/checkpoints", 16 | "tensorboard-dir": "/mnt/ssd-1/tensorboard", 17 | "log-dir": "/mnt/ssd-1/logs", 18 | "wandb_team": "eleutherai", 19 | } 20 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. 
*/ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-json 7 | - id: check-symlinks 8 | - id: check-yaml 9 | - id: destroyed-symlinks 10 | - id: end-of-file-fixer 11 | exclude: docs/CNAME 12 | - id: fix-byte-order-marker 13 | - id: fix-encoding-pragma 14 | args: [--remove] 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: requirements-txt-fixer 18 | - id: trailing-whitespace 19 | - repo: https://gitlab.com/daverona/pre-commit-cpp 20 | rev: 0.8.0 21 | hooks: 22 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 23 | args: [] 24 | 25 | - repo: https://github.com/psf/black 26 | rev: 21.8b0 27 | hooks: 28 | - id: black 29 | language_version: python3.8 30 | -------------------------------------------------------------------------------- /megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports 2 | # TODO: should be able to get rid of this file entirely 3 | 4 | import deepspeed 5 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 6 | 7 | # Default name for the model parallel rng tracker. 8 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 9 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 10 | ) 11 | 12 | # Whether apply model parallelsim to checkpointed hidden states. 13 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 14 | 15 | # RNG tracker object. 16 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 17 | 18 | # Deepspeed checkpointing functions 19 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 20 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 21 | checkpoint = checkpointing.checkpoint 22 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 23 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 24 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .initialize import initialize_megatron 27 | from .neox_arguments import NeoXArgs 28 | -------------------------------------------------------------------------------- /megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import logging 3 | 4 | 5 | @dataclass 6 | class NeoXArgsTemplate: 7 | def defaults(self): 8 | """ 9 | generator for getting default values. 10 | """ 11 | for key, field_def in self.__dataclass_fields__.items(): 12 | yield key, field_def.default 13 | 14 | def update_value(self, key: str, value): 15 | """ 16 | updates a property value if the key already exists 17 | 18 | Problem: a previously non-existing property can be added to the class instance without error. 19 | """ 20 | if hasattr(self, key): 21 | setattr(self, key, value) 22 | else: 23 | error_message = ( 24 | self.__class__.__name__ 25 | + ".update_value() to be updated property " 26 | + str(key) 27 | + " does not exist" 28 | ) 29 | logging.error(error_message) 30 | raise ValueError(error_message) 31 | 32 | def update_values(self, d): 33 | """ 34 | Updates multiple values in self if the keys already exists 35 | """ 36 | for k, v in d.items(): 37 | self.update_value(k, v) 38 | -------------------------------------------------------------------------------- /configs/myconfigs/data_config.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data-path": "data/train_data/passage_crits_text", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 7 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 8 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab-file": "data/tokenizer/20B_tokenizer.json", 19 | 20 | "save": "checkpoints", 21 | "load": "checkpoints", 22 | "checkpoint_validation_with_forward_pass": False, 23 | 24 | "tensorboard-dir": "tensorboard", 25 | "log-dir": "logs", 26 | "use_wandb": True, 27 | "wandb_host": "https://api.wandb.ai", 28 | "wandb_project": "neox" 29 | } 30 | -------------------------------------------------------------------------------- /configs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data-path": "data/enron/enron_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 7 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 8 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab-file": "data/gpt2-vocab.json", 19 | "merge-file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard-dir": "tensorboard", 26 | "log-dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /configs/myconfigs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data-path": "data/enron/enron_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 7 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 8 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab-file": "data/gpt2-vocab.json", 19 | "merge-file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard-dir": "tensorboard", 26 | "log-dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /tests/Readme.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | 3 | Tests use pytest with the pytest-cov and pytest-forked plugins. Install with: 4 | 5 | ```bash 6 | pip install -r requirements/requirements-dev.txt 7 | ``` 8 | 9 | # Run 10 | 11 | Tests can be run using pytest. 12 | 13 | * The argument --forked needs to be provided 14 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation) 15 | * A subset of tests can be selected by pointing to the module within tests 16 | 17 | ```bash 18 | # run all tests, output coverage report of megatron module in terminal 19 | pytest --forked --cov-report term --cov=megatron tests 20 | 21 | # run tests in tests/model, output coverage report of megatron module as html 22 | pytest --forked --cov-report html --cov=megatron tests/model 23 | 24 | # run tests in tests/model/test_model_generation.py, don't output coverage report 25 | pytest --forked tests/model/test_model_generation.py 26 | ``` 27 | 28 | Some tests can run on CPU only. These are marked with the decorator @pytest.mark.cpu. 29 | The CPU test cases can be run with: 30 | ```bash 31 | pytest tests -m cpu 32 | ``` 33 | 34 | If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files. 35 | 36 | ```bash 37 | python -m http.server --directory htmlcov 8000 38 | ``` 39 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | """Pretrain""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | if __name__ == "__main__": 23 | neox_args = NeoXArgs.consume_neox_args() 24 | neox_args.configure_distributed_args() 25 | neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab 26 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 27 | pretrain(neox_args=neox_args) 28 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typing import Optional 4 | from torch import Tensor 5 | 6 | # flags required to enable jit fusion kernels 7 | torch._C._jit_set_profiling_mode(False) 8 | torch._C._jit_set_profiling_executor(False) 9 | torch._C._jit_override_can_fuse_on_cpu(True) 10 | torch._C._jit_override_can_fuse_on_gpu(True) 11 | 12 | 13 | def bias_dropout_add( 14 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 15 | ) -> Tensor: 16 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 17 | if residual is not None: 18 | out = residual + out 19 | return out 20 | 21 | 22 | def get_bias_dropout_add(training): 23 | def _bias_dropout_add(x, bias, residual, prob): 24 | return bias_dropout_add(x, bias, residual, prob, training) 25 | 26 | return _bias_dropout_add 27 | 28 | 29 | @torch.jit.script 30 | def bias_dropout_add_fused_train( 31 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 32 | ) -> Tensor: 33 | return bias_dropout_add(x, bias, residual, prob, True) 34 | 35 | 36 | @torch.jit.script 37 | def bias_dropout_add_fused_inference( 38 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 39 | ) -> Tensor: 40 | return bias_dropout_add(x, bias, residual, prob, False) 41 | -------------------------------------------------------------------------------- /deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021, EleutherAI contributors 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed 20 | from deepspeed.launcher.runner import main 21 | 22 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | from megatron.neox_arguments import NeoXArgs 25 | from megatron.utils import get_wandb_api_key 26 | 27 | 28 | neox_args = NeoXArgs.consume_deepy_args() 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | if __name__ == "__main__": 38 | main(deepspeed_main_args) 39 | -------------------------------------------------------------------------------- /.github/workflows/docker_build.yml: -------------------------------------------------------------------------------- 1 | name: docker_build 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | main: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - 13 | name: Checkout 14 | uses: actions/checkout@v2 15 | 16 | - 17 | name: Docker meta 18 | id: docker_meta 19 | uses: crazy-max/ghaction-docker-meta@v1 20 | with: 21 | images: leogao2/gpt-neox # list of Docker images to use as base name for tags 22 | tag-sha: true # add git short SHA as Docker tag 23 | 24 | - 25 | name: Set up QEMU 26 | uses: docker/setup-qemu-action@v1 27 | 28 | - 29 | name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v1 31 | 32 | - 33 | name: Login to DockerHub 34 | uses: docker/login-action@v1 35 | with: 36 | username: ${{ secrets.DOCKERHUB_USERNAME }} 37 | password: ${{ secrets.DOCKERHUB_TOKEN }} 38 | 39 | - 40 | name: Build and push 41 | id: docker_build 42 | uses: docker/build-push-action@v2 43 | with: 44 | push: ${{ github.event_name != 'pull_request' }} 45 | tags: ${{ steps.docker_meta.outputs.tags }} 46 | labels: ${{ steps.docker_meta.outputs.labels }} 47 | 48 | - 49 | name: Image digest 50 | run: echo ${{ steps.docker_build.outputs.digest }} 51 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | import subprocess 18 | 19 | from torch.utils import cpp_extension 20 | from pathlib import Path 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | 24 | # Setting this param to a list has a problem of generating different 25 | # compilation commands (with diferent order of architectures) and 26 | # leading to recompilation of fused kernels. 
Set it to empty string 27 | # to avoid recompilation and assign arch flags explicity in 28 | # extra_cuda_cflags below 29 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 30 | 31 | def load_fused_kernels(): 32 | try: 33 | import scaled_upper_triang_masked_softmax_cuda 34 | import scaled_masked_softmax_cuda 35 | except (ImportError, ModuleNotFoundError): 36 | print("\n") 37 | print("=" * 100) 38 | print(f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them') 39 | print("=" * 100) 40 | exit() 41 | return 42 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | from tools.corpora import prepare_dataset, DATA_DOWNLOADERS 2 | import argparse 3 | 4 | TOKENIZER_CHOICES = [ 5 | "HFGPT2Tokenizer", 6 | "HFTokenizer", 7 | "GPT2BPETokenizer", 8 | "CharLevelTokenizer", 9 | ] 10 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"] 11 | 12 | 13 | def get_args(): 14 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets") 15 | parser.add_argument( 16 | "dataset", 17 | nargs="?", 18 | default="enron", 19 | help="name of dataset to download.", 20 | choices=DATASET_CHOICES, 21 | ) 22 | parser.add_argument( 23 | "-t", 24 | "--tokenizer", 25 | default="GPT2BPETokenizer", 26 | choices=TOKENIZER_CHOICES, 27 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}', 28 | ) 29 | parser.add_argument( 30 | "-d", 31 | "--data-dir", 32 | default=None, 33 | help=f"Directory to which to download datasets / tokenizer " 34 | f"files - defaults to ./data", 35 | ) 36 | parser.add_argument( 37 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)" 38 | ) 39 | parser.add_argument( 40 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)" 41 | ) 42 | return parser.parse_args() 43 | 44 | 45 | if __name__ == "__main__": 46 | args = get_args() 47 | prepare_dataset( 48 | dataset_name=args.dataset, 49 | tokenizer_type=args.tokenizer, 50 | data_dir=args.data_dir, 51 | vocab_file=args.vocab_file, 52 | merge_file=args.merge_file, 53 | ) 54 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | authors: 4 | - affiliation: EleutherAI 5 | family-names: Andonian 6 | given-names: Alex 7 | - affiliation: EleutherAI 8 | family-names: Biderman 9 | given-names: Stella 10 | - affiliation: EleutherAI 11 | family-names: Black 12 | given-names: Sid 13 | - affiliation: EleutherAI 14 | family-names: Gali 15 | given-names: Preetham 16 | - affiliation: EleutherAI 17 | family-names: Gao 18 | given-names: Leo 19 | - affiliation: EleutherAI 20 | family-names: Hallahan 21 | given-names: Eric 22 | - affiliation: EleutherAI 23 | family-names: Levy-Kramer 24 | given-names: Josh 25 | - affiliation: EleutherAI 26 | family-names: Leahy 27 | given-names: Connor 28 | - affiliation: EleutherAI 29 | family-names: Nestler 30 | given-names: Lucas 31 | - affiliation: EleutherAI 32 | family-names: Parker 33 | given-names: Kip 34 | - affiliation: EleutherAI 35 | family-names: Pieler 36 | given-names: Michael 37 | - affiliation: EleutherAI 38 | family-names: Purohit 39 | given-names: Shivanshu 40 | - affiliation: EleutherAI 41 | family-names: Songz 42 | given-names: Tri 43 | - affiliation: EleutherAI 44 | 
family-names: Phil 45 | given-names: Wang 46 | - affiliation: EleutherAI 47 | family-names: Weinbach 48 | given-names: Samuel 49 | cff-version: "1.1.0" 50 | keywords: 51 | - Transformers 52 | - "Massive language model" 53 | - "Autoregressive language model" 54 | license: "Apache-2.0" 55 | message: "If you use this software, please cite it using these metadata." 56 | repository-code: "https://www.github.com/eleutherai/gpt-neox" 57 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch" 58 | version: "0.0.1" 59 | date-released: 2021-08-23 60 | ... 61 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Evaluation tasks - modified from https://github.com/EleutherAI/lm-evaluation-harness""" 19 | 20 | import os 21 | import sys 22 | 23 | sys.path.append( 24 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 25 | ) 26 | from megatron.training import forward_step 27 | from megatron.utils import setup_for_inference_or_eval 28 | from eval_tasks import run_eval_harness 29 | from pprint import pprint 30 | from datetime import datetime 31 | import json 32 | 33 | 34 | def main(): 35 | model, neox_args = setup_for_inference_or_eval(use_cache=False) 36 | results = run_eval_harness( 37 | model, 38 | forward_step, 39 | neox_args, 40 | eval_tasks=neox_args.eval_tasks, 41 | bootstrap_iters=10000, 42 | ) 43 | if neox_args.rank == 0: 44 | pprint(results) 45 | results_path = ( 46 | f'eval_results_{datetime.now().strftime("%m-%d-%Y-%H-%M-%S")}.json' 47 | ) 48 | if neox_args.eval_results_prefix: 49 | results_path = f"{neox_args.eval_results_prefix}_{results_path}" 50 | with open(results_path, "w") as f: 51 | json.dump(results, f, indent=4) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /configs/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num-layers": 12, 12 | "hidden-size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num-attention-heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 
15 | "seq-length": 2048, 16 | "max-position-embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos-emb": "none", 19 | "no-weight-tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e-8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data-impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint-activations": true, 38 | "checkpoint-num-layers": 1, 39 | "partition-activations": false, 40 | "synchronize-each-layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight-decay": 0.1, 45 | "hidden-dropout": 0.0, 46 | "attention-dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. training settings 58 | "train-iters": 320000, 59 | "lr-decay-iters": 320000, 60 | "distributed-backend": "nccl", 61 | "lr-decay-style": "cosine", 62 | "warmup": 0.01, 63 | "save-interval": 10000, 64 | "eval-interval": 1000, 65 | "eval-iters": 10, 66 | 67 | # logging 68 | "log-interval": 100, 69 | "steps_per_print": 10, 70 | "keep-last-n-checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_usage.py: -------------------------------------------------------------------------------- 1 | """ 2 | plausibility check for the usage of neox_args in the megatron codebase 3 | """ 4 | import pytest 5 | import re 6 | from ..common import get_root_directory 7 | 8 | 9 | @pytest.mark.cpu 10 | def test_neoxargs_usage(): 11 | """ " 12 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs 13 | """ 14 | from megatron.neox_arguments import NeoXArgs 15 | 16 | declared_all = True 17 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys()) 18 | 19 | # we exlude a number of properties (implemented with the @property decorator) or functions that we know exists 20 | exclude = set( 21 | [ 22 | "params_dtype", 23 | "deepspeed_config", 24 | "get", 25 | "pop", 26 | "get_deepspeed_main_args", 27 | 'optimizer["params"]', 28 | "attention_config[layer_number]", 29 | "adlr_autoresume_object", 30 | "update_value", 31 | "all_config", 32 | "tensorboard_writer", 33 | "tokenizer", 34 | "train_batch_size]", 35 | ] 36 | ) 37 | 38 | # test file by file 39 | for filename in (get_root_directory() / "megatron").glob("**/*.py"): 40 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]: 41 | continue 42 | 43 | # load file 44 | with open(filename, "r") as f: 45 | file_contents = f.read() 46 | 47 | # find args matches 48 | matches = list( 49 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=])", file_contents) 50 | ) 51 | if len(matches) == 0: 52 | continue 53 | 54 | # compare 55 | for match in matches: 56 | if match not in neox_args_attributes and match not in exclude: 57 | print( 58 | f"(arguments used not found in neox args): {filename.name}: {match}", 59 | flush=True, 60 | ) 61 | declared_all = False 62 | 63 | assert declared_all, "all arguments used in code defined in NeoXArgs" 64 | -------------------------------------------------------------------------------- /configs/small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | 
{ 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_io_parallel_group 35 | from .initialize import initialize_model_parallel 36 | from .initialize import model_parallel_is_initialized 37 | 38 | from .layers import ColumnParallelLinear 39 | from .layers import RowParallelLinear 40 | from .layers import VocabParallelEmbedding 41 | from .layers import ParallelRelativePositionBias 42 | 43 | from .mappings import copy_to_model_parallel_region 44 | from .mappings import gather_from_model_parallel_region 45 | from .mappings import reduce_from_model_parallel_region 46 | from .mappings import scatter_to_model_parallel_region 47 | 48 | from .random import checkpoint 49 | from .random import get_cuda_rng_tracker 50 | from .random import model_parallel_cuda_manual_seed 51 | 52 | from .utils import divide 53 | from .utils import split_tensor_along_last_dim 54 | -------------------------------------------------------------------------------- /configs/13B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 40, 10 | "hidden-size": 5120, 11 | "num-attention-heads": 40, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | # optimizer settings 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.0001, 27 | "betas": [0.9, 0.999], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 1, 33 | "allgather_partitions": True, 34 | "allgather_bucket_size": 500000000, 35 | "overlap_comm": True, 36 | "reduce_scatter": True, 37 | "reduce_bucket_size": 500000000, 38 | "contiguous_gradients": True, 39 | "cpu_offload": False 40 | }, 41 | 42 | # batch / data settings 43 | "train_micro_batch_size_per_gpu": 4, 44 | "data-impl": "mmap", 45 | "split": "949,50,1", 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 
| # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/XL.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 2048, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | # optimizer settings 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.0002, 27 | "betas": [0.9, 0.999], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 1, 33 | "allgather_partitions": True, 34 | "allgather_bucket_size": 500000000, 35 | "overlap_comm": True, 36 | "reduce_scatter": True, 37 | "reduce_bucket_size": 500000000, 38 | "contiguous_gradients": True, 39 | "cpu_offload": False 40 | }, 41 | 42 | # batch / data settings 43 | "train_micro_batch_size_per_gpu": 4, 44 | "data-impl": "mmap", 45 | "split": "949,50,1", 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. 
training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/myconfigs/small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. 
training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/175B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 96, 10 | "hidden-size": 12288, 11 | "num-attention-heads": 96, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | # optimizer settings 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00006, 27 | "betas": [0.9, 0.999], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 1, 33 | "allgather_partitions": True, 34 | "allgather_bucket_size": 500000000, 35 | "overlap_comm": True, 36 | "reduce_scatter": True, 37 | "reduce_bucket_size": 500000000, 38 | "contiguous_gradients": True, 39 | "cpu_offload": False 40 | }, 41 | 42 | # batch / data settings 43 | "train_micro_batch_size_per_gpu": 4, 44 | "data-impl": "mmap", 45 | "split": "949,50,1", 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. 
training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/2-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 2560, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00016, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "save-interval": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /configs/6-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 4096, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00012, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "save-interval": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /configs/large.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1536, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00025, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "save-interval": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /configs/medium.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1024, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | 24 | # optimizer settings 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.0003, 29 | "betas": [0.9, 0.999], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "zero_optimization": { 34 | "stage": 1, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | "cpu_offload": False 42 | }, 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "save-interval": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /configs/myconfigs/model_config.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 4, 6 | "model-parallel-size": 2, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. 
training settings 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "save-interval": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "keep-last-n-checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/bnb_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "use-bnb-optimizer": true, 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled-upper-triang-masked-softmax-fusion": false, 21 | "bias-gelu-fusion": false, 22 | 23 | 24 | # optimizer settings 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.0006, 29 | "betas": [0.9, 0.999], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "zero_optimization": { 34 | "stage": 0, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | "cpu_offload": False 42 | }, 43 | 44 | # batch / data settings 45 | "train_micro_batch_size_per_gpu": 4, 46 | "data-impl": "mmap", 47 | "split": "949,50,1", 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.0, 58 | "hidden-dropout": 0.0, 59 | "attention-dropout": 0.0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "save-interval": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /megatron/fused_kernels/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | from torch.cuda import is_available as torch_cuda_available 5 | from pathlib import Path 6 | import subprocess 7 | 8 | 9 | def _get_cuda_bare_metal_version(cuda_dir): 10 | raw_output = subprocess.check_output( 11 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 12 | ) 13 | output = raw_output.split() 14 | release_idx = output.index("release") + 1 15 | release = output[release_idx].split(".") 16 | bare_metal_major = release[0] 17 | bare_metal_minor = release[1][0] 18 | 19 | return raw_output, bare_metal_major, bare_metal_minor 20 | 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | cc_flag = [] 24 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 25 | if int(bare_metal_major) >= 11: 26 | cc_flag.append("-gencode") 27 | cc_flag.append("arch=compute_80,code=sm_80") 28 | 29 | nvcc_flags = [ 30 | "-O3", 31 | "-gencode", 32 | "arch=compute_70,code=sm_70", 33 | "--use_fast_math", 34 | "-U__CUDA_NO_HALF_OPERATORS__", 35 | "-U__CUDA_NO_HALF_CONVERSIONS__", 36 | "--expt-relaxed-constexpr", 37 | "--expt-extended-lambda", 38 | ] 39 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} 40 | layernorm_cuda_args = { 41 | "cxx": ["-O3"], 42 | "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], 43 | } 44 | setup( 45 | name="fused_kernels", 46 | version="0.0.1", 47 | author="Sid Black & Alejandro Molina et al.", 48 | author_email="alejandro.molina@aleph-alpha.de", 49 | include_package_data=False, 50 | ext_modules=[ 51 | CUDAExtension( 52 | "scaled_upper_triang_masked_softmax_cuda", 53 | [ 54 | str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), 55 | str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), 56 | ], 57 | extra_compile_args=cuda_ext_args, 58 | ), 59 | CUDAExtension( 60 | "scaled_masked_softmax_cuda", 61 | [ 62 | str(srcpath / "scaled_masked_softmax.cpp"), 63 | str(srcpath / "scaled_masked_softmax_cuda.cu"), 64 | ], 65 | extra_compile_args=cuda_ext_args, 66 | ), 67 | ] 68 | if torch_cuda_available() 69 | else [], 70 | cmdclass={"build_ext": BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /megatron/model/norms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import LayerNorm as LayerNorm 3 | 4 | 5 | def get_norm(neox_args): 6 | if neox_args.norm == "rmsnorm": 7 | norm = RMSNorm 8 | eps = neox_args.rms_norm_epsilon 9 | elif neox_args.norm == "layernorm": 10 | eps = neox_args.layernorm_epsilon 11 | norm = LayerNorm 12 | elif neox_args.norm == "scalenorm": 13 | eps = neox_args.scalenorm_epsilon 14 | norm = ScaleNorm 15 | else: 16 | raise ValueError(f"norm {neox_args.norm} not recognized") 17 | return norm, eps 18 | 19 | 20 | class 
RMSNorm(torch.nn.Module): 21 | def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): 22 | """ 23 | Root Mean Square Layer Normalization 24 | :param dim: model size 25 | :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) 26 | :param eps: epsilon value, default 1e-8 27 | :param bias: whether use bias term for RMSNorm, disabled by 28 | default because RMSNorm doesn't enforce re-centering invariance. 29 | """ 30 | super(RMSNorm, self).__init__() 31 | 32 | self.eps = eps 33 | self.d = dim 34 | self.p = p 35 | self.bias = bias 36 | 37 | self.scale = torch.nn.Parameter(torch.ones(dim)) 38 | self.register_parameter("scale", self.scale) 39 | 40 | if self.bias: 41 | self.offset = torch.nn.Parameter(torch.zeros(dim)) 42 | self.register_parameter("offset", self.offset) 43 | 44 | def forward(self, x): 45 | if self.p < 0.0 or self.p > 1.0: 46 | norm_x = x.norm(2, dim=-1, keepdim=True) 47 | d_x = self.d 48 | else: 49 | partial_size = int(self.d * self.p) 50 | partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) 51 | 52 | norm_x = partial_x.norm(2, dim=-1, keepdim=True) 53 | d_x = partial_size 54 | 55 | rms_x = norm_x * d_x ** (-1.0 / 2) 56 | x_normed = x / (rms_x + self.eps) 57 | 58 | if self.bias: 59 | return self.scale * x_normed + self.offset 60 | 61 | return self.scale * x_normed 62 | 63 | 64 | class ScaleNorm(torch.nn.Module): 65 | def __init__(self, dim, eps=1e-5): 66 | super().__init__() 67 | self.g = torch.nn.Parameter(torch.ones(1)) 68 | self.eps = eps 69 | 70 | def forward(self, x): 71 | n = torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps) 72 | return x / n * self.g 73 | -------------------------------------------------------------------------------- /configs/small_bf16.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "type": "bfloat16", # set bf16 as precision 64 | 
"loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 71 | # misc. training settings 72 | "train-iters": 320000, 73 | "lr-decay-iters": 320000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "save-interval": 10000, 78 | "eval-interval": 1000, 79 | "eval-iters": 10, 80 | 81 | # logging 82 | "log-interval": 100, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 4, 85 | "wall_clock_breakdown": true, 86 | } 87 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Blendable dataset.""" 16 | 17 | import time 18 | 19 | import numpy as np 20 | import torch 21 | 22 | from megatron import print_rank_0 23 | from megatron import mpu 24 | 25 | 26 | class BlendableDataset(torch.utils.data.Dataset): 27 | def __init__(self, datasets, weights): 28 | self.datasets = datasets 29 | num_datasets = len(datasets) 30 | assert num_datasets == len(weights) 31 | 32 | self.size = 0 33 | for dataset in self.datasets: 34 | self.size += len(dataset) 35 | 36 | # Normalize weights. 37 | weights = np.array(weights, dtype=np.float64) 38 | sum_weights = np.sum(weights) 39 | assert sum_weights > 0.0 40 | weights /= sum_weights 41 | 42 | # Build indices. 
43 | start_time = time.time() 44 | assert num_datasets < 255 45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 47 | 48 | from megatron.data import helpers 49 | 50 | helpers.build_blending_indices( 51 | self.dataset_index, 52 | self.dataset_sample_index, 53 | weights, 54 | num_datasets, 55 | self.size, 56 | torch.distributed.get_rank() == 0, 57 | ) 58 | 59 | print( 60 | "> RANK {} elapsed time for building blendable dataset indices: " 61 | "{:.2f} (sec)".format( 62 | torch.distributed.get_rank(), time.time() - start_time 63 | ) 64 | ) 65 | 66 | def __len__(self): 67 | return self.size 68 | 69 | def __getitem__(self, idx): 70 | try: 71 | dataset_idx = self.dataset_index[idx] 72 | sample_idx = self.dataset_sample_index[idx] 73 | return self.datasets[dataset_idx][sample_idx] 74 | except IndexError: 75 | new_idx = idx % len(self) 76 | print( 77 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 78 | ) 79 | return self[new_idx] 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb logs 132 | wandb/ 133 | 134 | # data files 135 | data/**/*.idx 136 | data/**/*.bin 137 | data/**/*.json* 138 | data/**/*.txt 139 | data/**/*.gz 140 | data/**/*.np* 141 | data/**/*.npy 142 | checkpoints/ 143 | .vscode/ 144 | *.pt 145 | *.ckpt 146 | 147 | #test logs 148 | test_checkpoint/ 149 | test_logs/ 150 | logs/ 151 | tensorboard/ 152 | src/ 153 | -------------------------------------------------------------------------------- /megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | 18 | 19 | def ensure_divisibility(numerator, denominator): 20 | """Ensure that numerator is divisible by the denominator.""" 21 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 22 | numerator, denominator 23 | ) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): 34 | """Split a tensor along its last dimension. 35 | Arguments: 36 | tensor: input tensor. 37 | num_partitions: number of partitions to split the tensor 38 | contiguous_split_chunks: If True, make each chunk contiguous 39 | in memory. 40 | """ 41 | # Get the size and dimension. 42 | last_dim = tensor.dim() - 1 43 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 44 | # Split. 45 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 46 | # Note: torch.split does not create contiguous tensors by default. 
47 | if contiguous_split_chunks: 48 | return tuple(chunk.contiguous() for chunk in tensor_list) 49 | 50 | return tensor_list 51 | 52 | 53 | class VocabUtility: 54 | """Split the vocabulary into `world_size` chunks and return the 55 | first and last index of the vocabulary belonging to the `rank` 56 | partition. Note that indices are in [first, last)""" 57 | 58 | @staticmethod 59 | def vocab_range_from_per_partition_vocab_size( 60 | per_partition_vocab_size, rank, world_size 61 | ): 62 | index_f = rank * per_partition_vocab_size 63 | index_l = index_f + per_partition_vocab_size 64 | return index_f, index_l 65 | 66 | @staticmethod 67 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank, world_size 71 | ) 72 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include <cuda_fp16.h> 18 | #include <torch/extension.h> 19 | #include <vector> 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor); 26 | 27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads, 28 | torch::Tensor const& softmax_results, 29 | float scale_factor); 30 | 31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) 32 | { 33 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 34 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 35 | (input.scalar_type() == at::ScalarType::BFloat16), 36 | "Only fp16 and bf16 are supported"); 37 | 38 | return fwd_cuda(input, scale_factor); 39 | } 40 | 41 | torch::Tensor bwd(torch::Tensor const& output_grads, 42 | torch::Tensor const& softmax_results, 43 | float scale_factor) 44 | { 45 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 46 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 47 | 48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 49 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 50 | "Only fp16 and bf16 are supported"); 51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 63 | { 64 | m.def("forward", 65 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 66 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 67 | m.def("backward", 68 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 69 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 70 | } 71 | -------------------------------------------------------------------------------- /megatron/neox_arguments/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | NeoX Arguments manages all configuration arguments. 3 | 4 | **general** 5 | 6 | * The implementation makes use of the python dataclass. 7 | * The main class 'NeoXArgs' (in ./arguments) exposes all configuration attributes that are relevant to GPT NeoX 8 | * No attributes are nested (apart from attributes with type dict) 9 | * Output functions (enable_logging, save_yml, print) are implemented 10 | * Instantiation always runs NeoXArgs.__post_init__(), which calculates derived values and performs a validation (values, types, keys). 11 | * it is possible to set undefined attributes (e.g. line of code 'NeoXArgs().my_undefined_config = 42' works fine); such set attributes are not validated 12 | * It is possible to update attributes (e.g.
line of code 'NeoXArgs().do_train = True' works fine); a validation can be performed by calling the validation functions on the class instance 13 | * In order to avoid setting undefined attributes you can use the function NeoXArgs().update_value(); this function raises an error if the attribute to be set is not defined 14 | 15 | **instantiation** 16 | NeoX args can be instantiated with the following options 17 | 18 | * NeoXArgs.from_ymls(["path_to_yaml1", "path_to_yaml2", ...]): load yaml configuration files and instantiate with the values provided; checks for duplications and unknown arguments are performed 19 | * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from dict; checks for unknown arguments are performed 20 | 21 | * NeoXArgs.consume_deepy_args(): entry point for deepy.py configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main 22 | * NeoXArgs.consume_neox_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). Arguments produced this way can be read with consume_neox_args() to instantiate a NeoXArgs instance. 23 | 24 | 25 | **code structure** 26 | 27 | * NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen 28 | * The Subclasses group args according to their purpose 29 | * The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be omitted 30 | * The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted 31 | * calculated attributes (decorator '@property') are available as attributes, but would not be included in dataclass fields (e.g.
NeoXArgs().__dataclass_fields__.items()) 32 | * refer to docstrings in code for more information 33 | """ 34 | 35 | 36 | from .arguments import NeoXArgs 37 | -------------------------------------------------------------------------------- /tests/test_configs/test_train_base.yml: -------------------------------------------------------------------------------- 1 | # GPT_2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 0, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 2, 10 | "hidden_size": 192, 11 | "num_attention_heads": 6, 12 | "seq_length": 1024, 13 | "max_position_embeddings": 1024, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | 22 | # optimizer settings 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.0006, 27 | "betas": [0.9, 0.999], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": True, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": True, 37 | "reduce_scatter": True, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": True, 40 | "cpu_offload": False 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data_impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.0, 57 | "hidden_dropout": 0.0, 58 | "attention_dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | # misc. 
training settings 70 | "train_iters": 320000, 71 | "lr_decay_iters": 320000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "save_interval": 10000, 76 | "eval_interval": 1000, 77 | "eval_iters": 10, 78 | 79 | # logging 80 | "log_interval": 100, 81 | "steps_per_print": 10, 82 | "keep_last_n_checkpoints": 4, 83 | "wall_clock_breakdown": true, 84 | 85 | # Suggested data paths when using GPT_NeoX locally 86 | "data_path": "data/enron/enron_text_document", 87 | 88 | # or for weighted datasets: 89 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 90 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 91 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 92 | # "train-data-weights": [1., 2.], 93 | # "test-data-weights": [2., 1.], 94 | # "valid-data-weights": [0.5, 0.4], 95 | 96 | "vocab_file": "data/gpt2-vocab.json", 97 | "merge_file": "data/gpt2-merges.txt", 98 | "save": "test_checkpoint", 99 | "load": "test_checkpoint", 100 | "tensorboard_dir": "test_tensorboard", 101 | "log_dir": "test_logs", 102 | 103 | } 104 | -------------------------------------------------------------------------------- /configs/myconfigs/20B.yml: -------------------------------------------------------------------------------- 1 | # DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100 2 | # GPUs. Depending on your system configuration, you may need to change some parameters in order to fit 3 | # the model in memory. 4 | 5 | { 6 | # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in 7 | "vocab-file": "./20B_checkpoints/20B_tokenizer.json", 8 | "save": "./20B_checkpoints", 9 | "load": "~/slim_weights", 10 | 11 | # If finetuning, edit the following to the location of your finetuning dataset: 12 | "data-path": "./data/train_data/passage_crits_text_document", 13 | 14 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 15 | # across the node boundaries ) 16 | "pipe-parallel-size": 4, 17 | "model-parallel-size": 2, 18 | 19 | # model settings 20 | "num-layers": 44, 21 | "hidden-size": 6144, 22 | "num-attention-heads": 64, 23 | "seq-length": 2048, 24 | "max-position-embeddings": 2048, 25 | "norm": "layernorm", 26 | "pos-emb": "rotary", 27 | "rotary_pct": 0.25, 28 | "no-weight-tying": true, 29 | "gpt_j_residual": true, 30 | "output_layer_parallelism": "column", 31 | "scaled-upper-triang-masked-softmax-fusion": true, 32 | "bias-gelu-fusion": true, 33 | 34 | # init methods 35 | "init_method": "small_init", 36 | "output_layer_init_method": "wang_init", 37 | 38 | # optimizer settings 39 | "optimizer": { 40 | "type": "Adam", 41 | "params": { 42 | "lr": 0.97e-4, 43 | "betas": [0.9, 0.95], 44 | "eps": 1.0e-8, 45 | } 46 | }, 47 | 48 | "min_lr": 0.97e-5, 49 | "zero_optimization": { 50 | "stage": 1, 51 | "allgather_partitions": True, 52 | "allgather_bucket_size": 1260000000, 53 | "overlap_comm": True, 54 | "reduce_scatter": True, 55 | "reduce_bucket_size": 1260000000, 56 | "contiguous_gradients": True, 57 | "cpu_offload": False 58 | }, 59 | 60 | # batch / data settings (assuming 96 GPUs) 61 | "train_micro_batch_size_per_gpu": 4, 62 | "gradient_accumulation_steps": 32, 63 | "data-impl": "mmap", 64 | "split": "995,4,1", 65 | 66 | # activation checkpointing 67 | "checkpoint-activations": true, 68 | 
"checkpoint-num-layers": 1, 69 | "partition-activations": false, 70 | "synchronize-each-layer": true, 71 | 72 | # regularization 73 | "gradient_clipping": 1.0, 74 | "weight-decay": 0.01, 75 | "hidden-dropout": 0, 76 | "attention-dropout": 0, 77 | 78 | # precision settings 79 | "fp16": { 80 | "fp16": true, 81 | "enabled": true, 82 | "loss_scale": 0, 83 | "loss_scale_window": 1000, 84 | "initial_scale_power": 12, 85 | "hysteresis": 2, 86 | "min_loss_scale": 1 87 | }, 88 | 89 | # misc. training settings 90 | "train-iters": 150000, 91 | "lr-decay-iters": 150000, 92 | 93 | "distributed-backend": "nccl", 94 | "lr-decay-style": "cosine", 95 | "warmup": 0.01, 96 | "save-interval": 500, 97 | "eval-interval": 1000, 98 | "eval-iters": 10, 99 | 100 | # logging 101 | "log-interval": 2, 102 | "steps_per_print": 2, 103 | "wall_clock_breakdown": false, 104 | 105 | ### NEW DATA: #### 106 | "tokenizer_type": "HFTokenizer", 107 | "tensorboard-dir": "./tensorboard", 108 | "log-dir": "./logs", 109 | 110 | } 111 | -------------------------------------------------------------------------------- /configs/20B.yml: -------------------------------------------------------------------------------- 1 | # DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100 2 | # GPUs. Depending on your system configuration, you may need to change some parameters in order to fit 3 | # the model in memory. 4 | 5 | { 6 | # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in 7 | "vocab-file": "./20B_checkpoints/20B_tokenizer.json", 8 | "save": "./20B_checkpoints", 9 | "load": "~/slim_weights", 10 | 11 | # If finetuning, edit the following to the location of your finetuning dataset: 12 | "data-path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", 13 | 14 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 15 | # across the node boundaries ) 16 | "pipe-parallel-size": 4, 17 | "model-parallel-size": 2, 18 | 19 | # model settings 20 | "num-layers": 44, 21 | "hidden-size": 6144, 22 | "num-attention-heads": 64, 23 | "seq-length": 2048, 24 | "max-position-embeddings": 2048, 25 | "norm": "layernorm", 26 | "pos-emb": "rotary", 27 | "rotary_pct": 0.25, 28 | "no-weight-tying": true, 29 | "gpt_j_residual": true, 30 | "output_layer_parallelism": "column", 31 | "scaled-upper-triang-masked-softmax-fusion": true, 32 | "bias-gelu-fusion": true, 33 | 34 | # init methods 35 | "init_method": "small_init", 36 | "output_layer_init_method": "wang_init", 37 | 38 | # optimizer settings 39 | "optimizer": { 40 | "type": "Adam", 41 | "params": { 42 | "lr": 0.97e-4, 43 | "betas": [0.9, 0.95], 44 | "eps": 1.0e-8, 45 | } 46 | }, 47 | 48 | "min_lr": 0.97e-5, 49 | "zero_optimization": { 50 | "stage": 1, 51 | "allgather_partitions": True, 52 | "allgather_bucket_size": 1260000000, 53 | "overlap_comm": True, 54 | "reduce_scatter": True, 55 | "reduce_bucket_size": 1260000000, 56 | "contiguous_gradients": True, 57 | "cpu_offload": False 58 | }, 59 | 60 | # batch / data settings (assuming 96 GPUs) 61 | "train_micro_batch_size_per_gpu": 4, 62 | "gradient_accumulation_steps": 32, 63 | "data-impl": "mmap", 64 | "split": "995,4,1", 65 | 66 | # activation checkpointing 67 | "checkpoint-activations": true, 68 | "checkpoint-num-layers": 1, 69 | "partition-activations": false, 70 | "synchronize-each-layer": true, 71 | 72 | # regularization 73 | "gradient_clipping": 1.0, 74 | "weight-decay": 0.01, 75 | 
"hidden-dropout": 0, 76 | "attention-dropout": 0, 77 | 78 | # precision settings 79 | "fp16": { 80 | "fp16": true, 81 | "enabled": true, 82 | "loss_scale": 0, 83 | "loss_scale_window": 1000, 84 | "initial_scale_power": 12, 85 | "hysteresis": 2, 86 | "min_loss_scale": 1 87 | }, 88 | 89 | # misc. training settings 90 | "train-iters": 150000, 91 | "lr-decay-iters": 150000, 92 | 93 | "distributed-backend": "nccl", 94 | "lr-decay-style": "cosine", 95 | "warmup": 0.01, 96 | "save-interval": 500, 97 | "eval-interval": 1000, 98 | "eval-iters": 10, 99 | 100 | # logging 101 | "log-interval": 2, 102 | "steps_per_print": 2, 103 | "wall_clock_breakdown": false, 104 | 105 | ### NEW DATA: #### 106 | "tokenizer_type": "HFTokenizer", 107 | "tensorboard-dir": "./tensorboard", 108 | "log-dir": "./logs", 109 | 110 | } 111 | -------------------------------------------------------------------------------- /configs/gen_docs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | from megatron.neox_arguments import neox_args, deepspeed_args 8 | from inspect import getmembers, getsource 9 | from dataclasses import field, is_dataclass 10 | from itertools import tee, zip_longest 11 | import pathlib 12 | 13 | 14 | def pairwise(iterable): 15 | "s -> (s0,s1), (s1,s2), (s2, s3), ..." 16 | a, b = tee(iterable) 17 | next(b, None) 18 | return zip_longest(a, b) 19 | 20 | 21 | def get_docs(module): 22 | ARGS_CLASSES = getmembers(module, is_dataclass) 23 | results = {} 24 | for name, dcls in ARGS_CLASSES: 25 | assert is_dataclass(dcls) 26 | src = getsource(dcls) 27 | d = dcls() 28 | loc = 0 29 | results[name] = {"doc": d.__doc__.strip(), "attributes": {}} 30 | for cur, _next in pairwise(d.__dataclass_fields__.items()): 31 | field_name, field_def = cur 32 | field_type = field_def.type 33 | if hasattr(field_type, "__name__"): 34 | field_type = field_type.__name__ 35 | else: 36 | field_type = str(field_type) 37 | 38 | field_default = field_def.default 39 | 40 | # try to find the field definition 41 | loc = src.find(f" {field_name}:", loc + len(field_name) + 1) 42 | 43 | if _next is not None: 44 | next_field_name, _ = _next 45 | # try to find the next field definition 46 | next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) 47 | else: 48 | next_loc = len(src) 49 | 50 | # try to get the docstring 51 | _src = src[loc:next_loc].strip() 52 | if '"""' in _src: 53 | doc = _src.split('"""')[1].strip() 54 | elif "'''" in _src: 55 | doc = _src.split("'''")[1].strip() 56 | else: 57 | doc = "" 58 | results[name]["attributes"][field_name] = { 59 | "name": field_name, 60 | "type": field_type, 61 | "default": field_default, 62 | "doc": doc, 63 | } 64 | return results 65 | 66 | 67 | def to_md(docs, intro_str=""): 68 | """ 69 | Writes the docs dictionary to markdown format 70 | """ 71 | lines = [] 72 | lines.append(intro_str) 73 | for name, doc in docs.items(): 74 | lines.append(f"## {name}") 75 | lines.append(f"{doc['doc']}") 76 | lines.append("") 77 | for field_name, field_def in doc["attributes"].items(): 78 | # attribute name and type 79 | lines.append(f"- **{field_name}**: {field_def['type']}") 80 | # default value 81 | lines.append(f" Default = {str(field_def['default'])}") 82 | lines.append(f" {field_def['doc']}") 83 | lines.append("") 84 | return "\n\n".join(lines) 85 | 86 | 87 | if __name__ == "__main__": 88 | docs = get_docs(neox_args) 89 | 
docs.update(get_docs(deepspeed_args)) 90 | intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n""" 91 | md = to_md(docs, intro_str=intro_str) 92 | with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: 93 | f.write(md) 94 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021 Josh Levy-Kramer . All rights reserved. 3 | # This file is based on code by the authors denoted below and has been modified from its original version. 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from megatron.utils import print_rank_0, setup_for_inference_or_eval 19 | 20 | from megatron.text_generation_utils import ( 21 | generate_samples_input_from_file, 22 | generate_samples_from_prompt, 23 | generate_samples_unconditional, 24 | generate_samples_interactive, 25 | ) 26 | 27 | 28 | def main(): 29 | """ 30 | Generate text/sample model 31 | """ 32 | model, neox_args = setup_for_inference_or_eval(use_cache=True) 33 | if neox_args.recompute: 34 | model.module.inference_mode( 35 | use_cache=False 36 | ) # don't use kv cache if recomputing 37 | if neox_args.text_gen_type == "unconditional": 38 | print_rank_0( 39 | f"Generating samples unconditionally and saving results to {neox_args.sample_output_file}" 40 | ) 41 | generate_samples_unconditional( 42 | neox_args=neox_args, 43 | model=model, 44 | number_of_samples=neox_args.num_samples, 45 | output_file=neox_args.sample_output_file, 46 | maximum_tokens=neox_args.maximum_tokens, 47 | recompute=neox_args.recompute, 48 | temperature=neox_args.temperature, 49 | top_k=neox_args.top_k, 50 | top_p=neox_args.top_p, 51 | ) 52 | 53 | elif neox_args.text_gen_type == "input-file": 54 | print_rank_0( 55 | f"Generating samples from input file {neox_args.sample_input_file}" 56 | ) 57 | assert neox_args.sample_input_file is not None 58 | generate_samples_input_from_file( 59 | neox_args=neox_args, 60 | model=model, 61 | input_file=neox_args.sample_input_file, 62 | output_file=neox_args.sample_output_file, 63 | maximum_tokens=neox_args.maximum_tokens, 64 | recompute=neox_args.recompute, 65 | temperature=neox_args.temperature, 66 | top_k=neox_args.top_k, 67 | top_p=neox_args.top_p, 68 | ) 69 | 70 | elif neox_args.text_gen_type == "interactive": 71 | generate_samples_interactive( 72 | neox_args=neox_args, 73 | model=model, 74 | recompute=neox_args.recompute, 75 | temperature=neox_args.temperature, 76 | maximum_tokens=neox_args.maximum_tokens, 77 | top_k=neox_args.top_k, 78 | top_p=neox_args.top_p, 79 | ) 80 | 81 | else: 82 | raise ValueError( 83 | f"`text-gen-type` either not specified or not recognised: {neox_args.text_gen_type}" 84 | ) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | 
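# A minimal sketch of driving the prompt-based generation path programmatically,
# using the same helpers imported at the top of this file. It assumes
# generate_samples_from_prompt accepts the keyword arguments that main() passes
# to the other generation helpers (text, maximum_tokens, recompute, temperature,
# top_k, top_p); the exact signature lives in megatron/text_generation_utils.py,
# and the function name example_generate_from_prompt is purely illustrative.
def example_generate_from_prompt(prompts=("EleutherAI is",)):
    from megatron.utils import setup_for_inference_or_eval
    from megatron.text_generation_utils import generate_samples_from_prompt

    # Build the model and NeoX arguments the same way main() does above.
    model, neox_args = setup_for_inference_or_eval(use_cache=True)
    # Generate a completion for each prompt and return the raw results.
    return generate_samples_from_prompt(
        neox_args=neox_args,
        model=model,
        text=list(prompts),
        maximum_tokens=neox_args.maximum_tokens,
        recompute=neox_args.recompute,
        temperature=neox_args.temperature,
        top_k=neox_args.top_k,
        top_p=neox_args.top_p,
    )
# As with main(), this is expected to run under the distributed launcher
# (deepy.py / deepspeed) so that model parallel state is initialized.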
-------------------------------------------------------------------------------- /tests/model/test_model_instantiation.py: -------------------------------------------------------------------------------- 1 | """ 2 | instantiate models with different configurations as a first possible point of failure 3 | """ 4 | 5 | import pytest 6 | 7 | import torch 8 | import os 9 | from ..common import distributed_test, model_setup, clear_test_dirs, parametrize, binary 10 | 11 | PARAMS_TO_TEST = { 12 | "pipe_parallel_size,model_parallel_size,world_size": [ 13 | [0, 1, 1], 14 | [1, 2, 2], 15 | [0, 2, 2], 16 | ], 17 | "no_weight_tying": binary, 18 | "attention_config": [ 19 | [[["global"], "all"]], 20 | [[["local"], "all"]], 21 | [[["sparse_variable"], "all"]], 22 | [[["sparse_fixed"], "all"]], 23 | ], 24 | "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [ 25 | [True, False], 26 | [False, True], 27 | ], 28 | "fp16,fp32_allreduce": [ 29 | [ 30 | { 31 | "enabled": True, 32 | "type": "bfloat16", 33 | "loss_scale": 0, 34 | "loss_scale_window": 1000, 35 | "hysteresis": 2, 36 | "min_loss_scale": 1, 37 | }, 38 | True, 39 | ], 40 | [ 41 | { 42 | "enabled": True, 43 | "loss_scale": 0, 44 | "loss_scale_window": 1000, 45 | "hysteresis": 2, 46 | "min_loss_scale": 1, 47 | }, 48 | False, 49 | ], 50 | ], 51 | } 52 | 53 | parameters, names = parametrize( 54 | PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 55 | ) 56 | 57 | 58 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 59 | def test_instantiate(param_dict): 60 | @distributed_test(world_size=param_dict.pop("world_size", 2)) 61 | def wrapper(): 62 | run_test_model_instantiation(param_dict=param_dict) 63 | 64 | wrapper() 65 | 66 | 67 | OPTIMIZER_PARAMS = { 68 | "optimizer": [ 69 | {"type": "adam", "params": {"lr": 0.0006}}, 70 | {"type": "onebitadam", "params": {"lr": 0.0006}}, 71 | {"type": "cpu_adam", "params": {"lr": 0.0006}}, 72 | {"type": "cpu_torch_adam", "params": {"lr": 0.0006}}, 73 | {"type": "sm3", "params": {"lr": 0.0006}}, 74 | {"type": "madgrad_wd", "params": {"lr": 0.0006}}, 75 | ] 76 | } 77 | opt_params, opt_name = parametrize( 78 | OPTIMIZER_PARAMS, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 79 | ) 80 | 81 | 82 | @pytest.mark.parametrize("param_dict", opt_params, ids=opt_name) 83 | def test_instantiate_optimizers(param_dict): 84 | @distributed_test(world_size=2) 85 | def wrapper(): 86 | run_test_model_instantiation(param_dict=param_dict) 87 | 88 | wrapper() 89 | 90 | 91 | def run_test_model_instantiation(yaml_list=None, param_dict=None): 92 | from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine 93 | 94 | model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict) 95 | if args_loaded.pipe_parallel_size < 2: 96 | assert isinstance(model, DeepSpeedEngine), "test model instantiation " + str( 97 | yaml_list 98 | ) 99 | else: 100 | assert isinstance(model, PipelineEngine), "test model instantiation " + str( 101 | yaml_list 102 | ) 103 | if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0: 104 | clear_test_dirs() 105 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor); 26 | 27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads, 28 | torch::Tensor const& softmax_results, 29 | float scale_factor); 30 | 31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads); 32 | 33 | torch::Tensor fwd(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) 34 | { 35 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 36 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 37 | (input.scalar_type() == at::ScalarType::BFloat16), 38 | "Only fp16 and bf16 are supported"); 39 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 40 | 41 | return fwd_cuda(input, mask, scale_factor); 42 | } 43 | 44 | torch::Tensor bwd(torch::Tensor const& output_grads, 45 | torch::Tensor const& softmax_results, 46 | float scale_factor) 47 | { 48 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 49 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 50 | 51 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 52 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 55 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 56 | "Only fp16 and bf16 are supported"); 57 | 58 | return bwd_cuda(output_grads, softmax_results, scale_factor); 59 | } 60 | 61 | int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads) 62 | { 63 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 64 | } 65 | 66 | } // end namespace scaled_masked_softmax 67 | } // end namespace fused_softmax 68 | } // end namespace multihead_attn 69 | 70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 71 | { 72 | m.def("forward", 73 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 74 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 75 | 76 | m.def("backward", 77 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 78 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 79 | 80 | m.def("get_batch_per_block", 81 | &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, 82 | "Return Batch per block size."); 83 | } 84 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "scaled_upper_triang_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor) 32 | { 33 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 34 | const int attn_batches = input.size(0); 35 | const int seq_len = input.size(1); 36 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 37 | 38 | // Output 39 | auto act_options = input.options().requires_grad(false); 40 | torch::Tensor softmax_results = torch::empty({attn_batches, seq_len, seq_len}, act_options); 41 | 42 | // Softmax Intermediate Result Ptr 43 | void* input_ptr = static_cast(input.data_ptr()); 44 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 45 | 46 | DISPATCH_HALF_AND_BFLOAT( 47 | input.scalar_type(), 48 | "dispatch_scaled_upper_triang_masked_softmax_forward", 49 | dispatch_scaled_upper_triang_masked_softmax_forward( 50 | reinterpret_cast(softmax_results_ptr), 51 | reinterpret_cast(input_ptr), 52 | scale_factor, 53 | seq_len, 54 | seq_len, 55 | attn_batches);); 56 | return softmax_results; 57 | } 58 | 59 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, 60 | torch::Tensor const& softmax_results_, 61 | float scale_factor) 62 | { 63 | auto output_grads = output_grads_.contiguous(); 64 | auto softmax_results = softmax_results_.contiguous(); 65 | 66 | // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 67 | const int attn_batches = output_grads.size(0); 68 | const int seq_len = output_grads.size(1); 69 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 70 | 71 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 72 | 73 | // Softmax Grad 74 | DISPATCH_HALF_AND_BFLOAT( 75 | output_grads_.scalar_type(), 76 | "dispatch_scaled_upper_triang_masked_softmax_backward", 77 | dispatch_scaled_upper_triang_masked_softmax_backward( 78 | reinterpret_cast(output_grads_ptr), 79 | reinterpret_cast(output_grads_ptr), 80 | reinterpret_cast(softmax_results.data_ptr()), 81 | scale_factor, 82 | seq_len, 83 | seq_len, 84 | attn_batches);); 85 | 86 | // backward pass is completely in-place 87 | return output_grads; 88 | } 89 | } // namespace scaled_upper_triang_masked_softmax 90 | } // namespace fused_softmax 91 | } // namespace multihead_attn 92 | -------------------------------------------------------------------------------- /tests/model/test_model_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | instantiate models, save checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs 3 | 4 | This tests contain a relatively large number of functions. 
They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs 5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures. 6 | """ 7 | 8 | 9 | import os 10 | 11 | if __name__ == "__main__": 12 | import sys 13 | 14 | sys.path.append(os.path.abspath("")) 15 | 16 | import pytest 17 | from tests.common import distributed_test, model_setup, parametrize, dict_repr 18 | import torch 19 | 20 | PARAMS_TO_TEST = { 21 | "pipe_parallel_size,model_parallel_size,world_size": [ 22 | [0, 1, 1], 23 | [0, 1, 2], 24 | [1, 2, 2], 25 | [0, 2, 2], 26 | [2, 1, 2], 27 | ], 28 | "top_p,temperature,top_k": [[0.0, 0.5, 0], [0.5, 0.0, 100], [0.5, 0.5, 0]], 29 | "prompt": ["", "hello world"], 30 | "fp16,fp32_allreduce": [ 31 | [ 32 | { 33 | "enabled": True, 34 | "type": "bfloat16", 35 | "loss_scale": 0, 36 | "loss_scale_window": 1000, 37 | "hysteresis": 2, 38 | "min_loss_scale": 1, 39 | }, 40 | True, 41 | ], 42 | [ 43 | { 44 | "enabled": True, 45 | "loss_scale": 0, 46 | "loss_scale_window": 1000, 47 | "hysteresis": 2, 48 | "min_loss_scale": 1, 49 | }, 50 | False, 51 | ], 52 | ], 53 | } 54 | 55 | parameters, names = parametrize( 56 | PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 57 | ) 58 | 59 | 60 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 61 | def test_train(param_dict): 62 | @distributed_test(world_size=param_dict.pop("world_size", 2)) 63 | def wrapper(): 64 | run_generate_test(param_dict=param_dict, prompt=param_dict.pop("prompt")) 65 | 66 | wrapper() 67 | 68 | 69 | def run_generate_test(param_dict, prompt): 70 | from megatron.text_generation_utils import generate_samples_from_prompt 71 | from megatron.utils import is_mp_rank_0 72 | 73 | fixed_params = { 74 | "num_samples": 3, 75 | "maximum_tokens": 50, 76 | "make_vocab_size_divisible_by": 2, 77 | "sample_output_file": "test_sample_output.txt", 78 | "checkpoint_activations": False, 79 | "partition_activations": False, 80 | "no_load_optim": True, 81 | } 82 | 83 | param_dict.update(fixed_params) 84 | # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this 85 | model, _, _, args_loaded = model_setup( 86 | None, param_dict, clear_data=True, inference=True 87 | ) 88 | model.eval() 89 | 90 | prompts = [prompt for _ in range(args_loaded.num_samples)] 91 | output = generate_samples_from_prompt( 92 | neox_args=args_loaded, 93 | model=model, 94 | text=prompts, 95 | maximum_tokens=args_loaded.maximum_tokens, 96 | recompute=False, 97 | temperature=args_loaded.temperature, 98 | top_k=args_loaded.top_k, 99 | top_p=args_loaded.top_p, 100 | ) 101 | 102 | # outputs only get generated on mp rank 0 103 | if is_mp_rank_0(): 104 | assert len(output) == len(prompts) 105 | for prompt, out in zip(prompts, output): 106 | assert prompt == out["context"] 107 | assert len(out["text"]) > 0 108 | -------------------------------------------------------------------------------- /megatron/tokenizer/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Assumes a dataset of jsonl files in the same format as the neox training set. 
3 | """ 4 | 5 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers 6 | from tokenizers.normalizers import NFKC 7 | 8 | from glob import glob 9 | import os 10 | import json 11 | import argparse 12 | 13 | 14 | def load_jsonl(input_path, quiet=True) -> list: 15 | """ 16 | Read list of objects from a JSON lines file. 17 | """ 18 | data = [] 19 | with open(input_path, "r", encoding="utf-8") as f: 20 | for line in f: 21 | data.append(json.loads(line.rstrip("\n|\r"))) 22 | if not quiet: 23 | print("Loaded {} records from {}".format(len(data), input_path)) 24 | return data 25 | 26 | 27 | def json_iterator(input_dir, text_key="text"): 28 | all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json") 29 | for j in all_jsonls: 30 | data = load_jsonl(j) 31 | for doc in data: 32 | yield doc[text_key] 33 | 34 | 35 | def train_tokenizer( 36 | input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000 37 | ): 38 | """ 39 | Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path` 40 | 41 | :param input_dir: input directory containing jsonl files 42 | :param save_path: path to save tokenizer to 43 | :param tokenizer_type: type of tokenizer to train. 44 | :param vocab_size: int, size of tokenizer's vocab 45 | :return: 46 | """ 47 | 48 | if tokenizer_type == "BPE": 49 | model = models.BPE() 50 | else: 51 | raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented") 52 | tokenizer = Tokenizer(model) 53 | 54 | # Customize pre-tokenization and decoding 55 | tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) 56 | tokenizer.decoder = decoders.ByteLevel() 57 | tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) 58 | tokenizer.normalizer = NFKC() 59 | 60 | # And then train 61 | trainer = trainers.BpeTrainer( 62 | vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"] 63 | ) 64 | tokenizer.train_from_iterator(json_iterator(input_dir), trainer) 65 | 66 | # And Save it 67 | tokenizer.save(save_path, pretty=True) 68 | print(f"Tokenizer saved at {save_path}") 69 | 70 | 71 | def parse_args(): 72 | parser = argparse.ArgumentParser( 73 | description="script for training a multilingual " 74 | "HF tokenizer on CC dumps with upweighting for low resource languages" 75 | ) 76 | parser.add_argument( 77 | "--json_input_dir", 78 | type=str, 79 | help="Path to folder containing tokenizer training data in jsonl format", 80 | ) 81 | parser.add_argument( 82 | "--tokenizer_output_path", 83 | type=str, 84 | help="Path to which your trained tokenizer will be saved (should end in .json)", 85 | ) 86 | parser.add_argument( 87 | "--tokenizer_type", 88 | type=str, 89 | help="type of tokenizer to train, currently only BPE is supported", 90 | choices=["BPE"], 91 | default=["BPE"], 92 | ) 93 | parser.add_argument( 94 | "-v", 95 | "--vocab_size", 96 | help="vocabulary size of tokenizer, default=52k", 97 | type=int, 98 | default=52000, 99 | ) 100 | return parser.parse_args() 101 | 102 | 103 | if __name__ == "__main__": 104 | 105 | args = parse_args() 106 | 107 | train_tokenizer( 108 | args.json_input_dir, 109 | save_path=args.tokenizer_output_path, 110 | tokenizer_type=args.tokenizer_type, 111 | vocab_size=args.vocab_size, 112 | ) 113 | -------------------------------------------------------------------------------- /tests/model/test_model_checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | instantiate models, save 
checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs 3 | 4 | This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs 5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures. 6 | """ 7 | import os 8 | 9 | if __name__ == "__main__": 10 | import sys 11 | 12 | sys.path.append(os.path.abspath("")) 13 | 14 | import pytest 15 | from tests.common import ( 16 | distributed_test, 17 | clear_test_dirs, 18 | model_setup, 19 | binary, 20 | parametrize, 21 | ) 22 | import torch 23 | 24 | PARAMS_TO_TEST = { 25 | "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2], [2, 1]], 26 | "checkpoint_validation_with_forward_pass": [True], 27 | "fp16,fp32_allreduce": [ 28 | [ 29 | { 30 | "enabled": True, 31 | "type": "bfloat16", 32 | "loss_scale": 0, 33 | "loss_scale_window": 1000, 34 | "hysteresis": 2, 35 | "min_loss_scale": 1, 36 | }, 37 | True, 38 | ], 39 | [ 40 | { 41 | "enabled": True, 42 | "loss_scale": 0, 43 | "loss_scale_window": 1000, 44 | "hysteresis": 2, 45 | "min_loss_scale": 1, 46 | }, 47 | False, 48 | ], 49 | ], 50 | } 51 | 52 | parameters, names = parametrize( 53 | PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 54 | ) 55 | 56 | 57 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 58 | def test_train(param_dict): 59 | @distributed_test(world_size=2) 60 | def wrapper(): 61 | run_checkpoint_test(param_dict=param_dict) 62 | 63 | wrapper() 64 | 65 | 66 | def run_checkpoint_test(yaml_list=None, param_dict=None): 67 | 68 | from megatron.checkpointing import load_checkpoint 69 | from megatron.checkpointing import save_checkpoint 70 | 71 | model, optimizer, lr_scheduler, args_loaded = model_setup( 72 | yaml_list, param_dict, clear_data=True 73 | ) 74 | 75 | # save model checkpoint 76 | save_checkpoint( 77 | neox_args=args_loaded, 78 | iteration=42, 79 | model=model, 80 | optimizer=optimizer, 81 | lr_scheduler=lr_scheduler, 82 | ) 83 | 84 | # reload model from checkpoint 85 | ( 86 | reloaded_model, 87 | reloaded_optimizer, 88 | reloaded_lr_scheduler, 89 | args_reloaded, 90 | ) = model_setup(yaml_list, param_dict, clear_data=False) 91 | iteration = load_checkpoint( 92 | neox_args=args_reloaded, 93 | model=reloaded_model, 94 | optimizer=reloaded_optimizer, 95 | lr_scheduler=reloaded_lr_scheduler, 96 | ) 97 | 98 | # ensure same checkpoint is loaded 99 | assert ( 100 | iteration == 42 101 | ), "run_checkpoint_test() iteration loaded from checkpoint correct" 102 | 103 | # check all weight groups are the same 104 | for idx, ((n1, p1), (n2, p2)) in enumerate( 105 | zip( 106 | list(model.module.named_parameters()), 107 | list(reloaded_model.module.named_parameters()), 108 | ) 109 | ): 110 | assert n1 == n2 111 | params_equal = (p1 == p2).all().item() 112 | assert params_equal, "run_checkpoint_test() params equal: " + str(n1) 113 | 114 | if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0: 115 | clear_test_dirs() 116 | 117 | 118 | if __name__ == "__main__": 119 | params = list( 120 | parametrize( 121 | PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 122 | ) 123 | ) 124 | test_train(params[0]) 125 | -------------------------------------------------------------------------------- /megatron/mpu/data.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from .initialize import get_model_parallel_group 18 | from .initialize import get_model_parallel_rank 19 | from .initialize import get_model_parallel_src_rank 20 | 21 | 22 | _MAX_DATA_DIM = 4 23 | 24 | 25 | def _check_data_types(keys, data, target_dtype): 26 | """Check that all the keys have the same target data type.""" 27 | for key in keys: 28 | assert ( 29 | data[key].dtype == target_dtype 30 | ), "{} has data type {} which " "is different than {}".format( 31 | key, data[key].dtype, target_dtype 32 | ) 33 | 34 | 35 | def _build_key_size_numel_dictionaries(keys, data): 36 | """Build the size on rank 0 and broadcast.""" 37 | max_dim = _MAX_DATA_DIM 38 | sizes = [0 for _ in range(max_dim) for _ in keys] 39 | 40 | # Pack the sizes on rank zero. 41 | if get_model_parallel_rank() == 0: 42 | offset = 0 43 | for key in keys: 44 | assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM" 45 | size = data[key].size() 46 | for i, s in enumerate(size): 47 | sizes[i + offset] = s 48 | offset += max_dim 49 | 50 | # Move to GPU and broadcast. 51 | sizes_cuda = torch.cuda.LongTensor(sizes) 52 | torch.distributed.broadcast( 53 | sizes_cuda, get_model_parallel_src_rank(), group=get_model_parallel_group() 54 | ) 55 | 56 | # Move back to cpu and unpack. 57 | sizes_cpu = sizes_cuda.cpu() 58 | key_size = {} 59 | key_numel = {} 60 | total_numel = 0 61 | offset = 0 62 | for key in keys: 63 | i = 0 64 | size = [] 65 | numel = 1 66 | while sizes_cpu[offset + i] > 0: 67 | this_size = sizes_cpu[offset + i] 68 | size.append(this_size) 69 | numel *= this_size 70 | i += 1 71 | key_size[key] = size 72 | key_numel[key] = numel 73 | total_numel += numel 74 | offset += max_dim 75 | 76 | return key_size, key_numel, total_numel 77 | 78 | 79 | def broadcast_data(keys, data, datatype): 80 | """Broadcast data from rank zero of each model parallel group to the 81 | members of the same model parallel group. 82 | 83 | Arguments: 84 | keys: list of keys in the data disctionary to be broadcasted 85 | data: data dictionary of string keys and cpu tensor values. 86 | datatype: torch data type of all tensors in data associated 87 | with keys. 88 | """ 89 | # Build (key, size) and (key, number of elements) dictionaries along 90 | # with the total number of elements on all ranks. 91 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) 92 | 93 | # Pack on rank zero. 94 | if get_model_parallel_rank() == 0: 95 | # Check that all keys have the same data type. 
96 | _check_data_types(keys, data, datatype) 97 | # Flatten the data associated with the keys 98 | flatten_data = torch.cat( 99 | [data[key].contiguous().view(-1) for key in keys], dim=0 100 | ).cuda() 101 | else: 102 | flatten_data = torch.empty( 103 | total_numel, device=torch.cuda.current_device(), dtype=datatype 104 | ) 105 | 106 | # Boradcast 107 | torch.distributed.broadcast( 108 | flatten_data, get_model_parallel_src_rank(), group=get_model_parallel_group() 109 | ) 110 | 111 | # Unpack 112 | output = {} 113 | offset = 0 114 | for key in keys: 115 | size = key_size[key] 116 | numel = key_numel[key] 117 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 118 | offset += numel 119 | 120 | return output 121 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | #### System package (uses default Python 3 version in Ubuntu 20.04) 6 | RUN apt-get update -y && \ 7 | apt-get install -y \ 8 | git python3 python3-dev libpython3-dev python3-pip sudo pdsh \ 9 | htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \ 10 | nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \ 11 | rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \ 12 | rdmacm-utils perftest rdma-core nano && \ 13 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 14 | update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ 15 | pip install --upgrade pip && \ 16 | pip install gpustat 17 | 18 | ### SSH 19 | # Set password 20 | RUN echo 'password' >> password.txt && \ 21 | mkdir /var/run/sshd && \ 22 | echo "root:`cat password.txt`" | chpasswd && \ 23 | # Allow root login with password 24 | sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ 25 | # Prevent user being kicked off after login 26 | sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \ 27 | echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ 28 | echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \ 29 | # FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42 30 | echo "Set disable_coredump false" >> /etc/sudo.conf && \ 31 | # Clean up 32 | rm password.txt 33 | 34 | # Expose SSH port 35 | EXPOSE 22 36 | 37 | #### OPENMPI 38 | ENV OPENMPI_BASEVERSION=4.1 39 | ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0 40 | RUN mkdir -p /build && \ 41 | cd /build && \ 42 | wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ 43 | cd openmpi-${OPENMPI_VERSION} && \ 44 | ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ 45 | make -j"$(nproc)" install && \ 46 | ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ 47 | # Sanity check: 48 | test -f /usr/local/mpi/bin/mpic++ && \ 49 | cd ~ && \ 50 | rm -rf /build 51 | 52 | # Needs to be in docker PATH if compiling other items & bashrc PATH (later) 53 | ENV PATH=/usr/local/mpi/bin:${PATH} \ 54 | LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} 55 | 56 | # Create a wrapper for OpenMPI to allow running as root by default 57 | RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ 58 | 
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ 59 | echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ 60 | chmod a+x /usr/local/mpi/bin/mpirun 61 | 62 | #### User account 63 | RUN useradd --create-home --uid 1000 --shell /bin/bash mchorse && \ 64 | usermod -aG sudo mchorse && \ 65 | echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers 66 | 67 | ## SSH config and bashrc 68 | RUN mkdir -p /home/mchorse/.ssh /job && \ 69 | echo 'Host *' > /home/mchorse/.ssh/config && \ 70 | echo ' StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \ 71 | echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \ 72 | echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \ 73 | echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \ 74 | echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc 75 | 76 | #### Python packages 77 | RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge 78 | COPY requirements/requirements.txt . 79 | COPY requirements/requirements-onebitadam.txt . 80 | COPY requirements/requirements-sparseattention.txt . 81 | RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && pip install -r requirements-sparseattention.txt && pip cache purge 82 | 83 | ## Install APEX 84 | RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597 85 | 86 | # Clear staging 87 | RUN mkdir -p /tmp && chmod 0777 /tmp 88 | 89 | #### SWITCH TO mchorse USER 90 | USER mchorse 91 | WORKDIR /home/mchorse 92 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_commandline.py: -------------------------------------------------------------------------------- 1 | """ 2 | verify parsing and handover of command line arguments 3 | """ 4 | import pytest 5 | import sys 6 | from unittest.mock import patch 7 | 8 | from ..common import get_root_directory, get_config_directory, get_configs_with_path 9 | 10 | 11 | @pytest.mark.cpu 12 | def test_neoxargs_consume_deepy_args_with_config_dir(): 13 | """ 14 | verify consume_deepy_args processes command line arguments without config dir 15 | """ 16 | 17 | from megatron.neox_arguments import NeoXArgs 18 | 19 | # load neox args with command line 20 | with patch( 21 | "sys.argv", 22 | [str(get_root_directory() / "deepy.py"), "pretrain_gpt2.py"] 23 | + get_configs_with_path(["small.yml", "local_setup.yml"]), 24 | ): 25 | args_loaded_consume = NeoXArgs.consume_deepy_args() 26 | 27 | # load neox args directly from yaml files 28 | args_loaded_yamls = NeoXArgs.from_ymls( 29 | get_configs_with_path(["small.yml", "local_setup.yml"]) 30 | ) 31 | 32 | # update values from yaml files that cannot otherwise be matched 33 | args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py") 34 | args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group 35 | 36 | assert args_loaded_yamls == args_loaded_consume 37 | 38 | 39 | @pytest.mark.cpu 40 | def test_neoxargs_consume_deepy_args_without_yml_suffix(): 41 | """ 42 | verify consume_deepy_args processes command line arguments without yaml suffix 43 | """ 44 | 45 | from megatron.neox_arguments import NeoXArgs 46 | 47 | # load neox args with command line 48 | with patch( 49 | "sys.argv", 50 | 
[str(get_root_directory() / "deepy.py"), "pretrain_gpt2.py"] 51 | + get_configs_with_path(["small", "local_setup"]), 52 | ): 53 | args_loaded_consume = NeoXArgs.consume_deepy_args() 54 | 55 | # load neox args directly from yaml files 56 | args_loaded_yamls = NeoXArgs.from_ymls( 57 | get_configs_with_path(["small.yml", "local_setup.yml"]) 58 | ) 59 | 60 | # update values from yaml files that cannot otherwise be matched 61 | args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py") 62 | args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group 63 | 64 | assert args_loaded_yamls == args_loaded_consume 65 | 66 | 67 | @pytest.mark.cpu 68 | def test_neoxargs_consume_deepy_args_with_config_dir(): 69 | """ 70 | verify consume_deepy_args processes command line arguments including config dir 71 | """ 72 | 73 | from megatron.neox_arguments import NeoXArgs 74 | 75 | # load neox args with command line 76 | with patch( 77 | "sys.argv", 78 | [ 79 | str(get_root_directory() / "deepy.py"), 80 | "pretrain_gpt2.py", 81 | "-d", 82 | str(get_config_directory()), 83 | ] 84 | + ["small.yml", "local_setup.yml"], 85 | ): 86 | args_loaded_consume = NeoXArgs.consume_deepy_args() 87 | 88 | # load neox args directly from yaml files 89 | args_loaded_yamls = NeoXArgs.from_ymls( 90 | get_configs_with_path(["small.yml", "local_setup.yml"]) 91 | ) 92 | 93 | # update values from yaml files that cannot otherwise be matched 94 | args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py") 95 | args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group 96 | 97 | assert args_loaded_yamls == args_loaded_consume 98 | 99 | 100 | @pytest.mark.cpu 101 | def test_neoxargs_consume_neox_args(): 102 | """ 103 | verify megatron args are correctly consumed after sending via deepspeed 104 | """ 105 | from megatron.neox_arguments import NeoXArgs 106 | 107 | # intitially load config from files as would be the case in deepy.py 108 | yaml_list = get_configs_with_path(["small.yml", "local_setup.yml"]) 109 | args_baseline = NeoXArgs.from_ymls(yaml_list) 110 | args_baseline.update_value( 111 | "user_script", str(get_root_directory() / "pretrain_gpt2.py") 112 | ) 113 | deepspeed_main_args = args_baseline.get_deepspeed_main_args() 114 | 115 | # patch sys.argv so that args can be access by set_global_variables within initialize_megatron 116 | with patch("sys.argv", deepspeed_main_args): 117 | args_loaded = NeoXArgs.consume_neox_args() 118 | 119 | # TODO is the wandb group really to be changed? 
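    # (likely yes -- as in the other tests in this file, wandb_group is one of
    # the values that "cannot otherwise be matched" between two separate loads,
    # so it is overwritten before the comparison)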
120 | args_loaded.wandb_group = args_baseline.wandb_group 121 | assert args_baseline.megatron_config == args_loaded.megatron_config 122 | -------------------------------------------------------------------------------- /megatron/model/init_functions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def init_method_normal(sigma): 7 | """Init method based on N(0, sigma).""" 8 | 9 | def init_(tensor): 10 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 11 | 12 | return init_ 13 | 14 | 15 | def scaled_init_method_normal(sigma, num_layers): 16 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 17 | std = sigma / math.sqrt(2.0 * num_layers) 18 | 19 | def init_(tensor): 20 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 21 | 22 | return init_ 23 | 24 | 25 | # orthogonal init does not support fp16, so have to patch it 26 | def _orthogonal(tensor, gain=1): 27 | if tensor.ndimension() < 2: 28 | raise ValueError("Only tensors with 2 or more dimensions are supported") 29 | 30 | rows = tensor.size(0) 31 | cols = tensor.numel() // rows 32 | flattened = tensor.new(rows, cols).normal_(0, 1) 33 | 34 | if rows < cols: 35 | flattened.t_() 36 | 37 | # Compute the qr factorization 38 | dt = flattened.dtype 39 | flattened = flattened.to(torch.float32) # orthogonal init does not support fp16 40 | q, r = torch.qr(flattened) 41 | q, r = q.to(dtype=dt), r.to(dtype=dt) 42 | # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf 43 | d = torch.diag(r, 0) 44 | ph = d.sign() 45 | q *= ph 46 | 47 | if rows < cols: 48 | q.t_() 49 | 50 | with torch.no_grad(): 51 | tensor.view_as(q).copy_(q) 52 | tensor.mul_(gain) 53 | return tensor 54 | 55 | 56 | def orthogonal_init_method(n_layers=1): 57 | """Fills the input Tensor with a (semi) orthogonal matrix, as described in 58 | Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) 59 | Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" 60 | 61 | def init_(tensor): 62 | return _orthogonal(tensor, math.sqrt(2 / n_layers)) 63 | 64 | return init_ 65 | 66 | 67 | def xavier_uniform_init_method(): 68 | """Fills the input Tensor with values according to the method described in Understanding the difficulty of 69 | training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" 70 | 71 | def init_(tensor): 72 | return torch.nn.init.xavier_uniform_(tensor) 73 | 74 | return init_ 75 | 76 | 77 | def xavier_normal_init_method(): 78 | """Fills the input Tensor with values according to the method described in Understanding the difficulty of 79 | training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution.""" 80 | 81 | def init_(tensor): 82 | return torch.nn.init.xavier_normal_(tensor) 83 | 84 | return init_ 85 | 86 | 87 | def small_init_init_method(dim): 88 | """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving 89 | the Normalization of Self-Attention - Nguyen, T. & Salazar, J. 
(2010), using a normal distribution.""" 90 | std = math.sqrt(2 / (5 * dim)) 91 | 92 | def init_(tensor): 93 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 94 | 95 | return init_ 96 | 97 | 98 | def wang_init_method(n_layers, dim): 99 | std = 2 / n_layers / math.sqrt(dim) 100 | 101 | def init_(tensor): 102 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 103 | 104 | return init_ 105 | 106 | 107 | def get_init_methods(args): 108 | def _get(name): 109 | if name == "normal": 110 | return init_method_normal(args.init_method_std) 111 | elif name == "scaled_normal": 112 | return scaled_init_method_normal(args.init_method_std, args.num_layers) 113 | elif name == "orthogonal": 114 | return orthogonal_init_method() 115 | elif name == "scaled_orthogonal": 116 | return orthogonal_init_method(args.num_layers) 117 | elif name == "xavier_uniform": 118 | return xavier_uniform_init_method() 119 | elif name == "xavier_normal": 120 | return xavier_normal_init_method() 121 | elif name == "wang_init": 122 | return wang_init_method(args.num_layers, args.hidden_size) 123 | elif name == "small_init": 124 | return small_init_init_method(args.hidden_size) 125 | else: 126 | raise NotImplementedError(f"Unkown init method {name}") 127 | 128 | return _get(args.init_method), _get(args.output_layer_init_method) 129 | -------------------------------------------------------------------------------- /megatron/model/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import torch.nn.functional as F 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | 24 | def get_activation(neox_args): 25 | """retrieves the activation function specified in neox_args""" 26 | if neox_args.activation == "geglu": 27 | activation_func = GEGLU(neox_args=neox_args) 28 | elif neox_args.activation == "gelu": 29 | if neox_args.onnx_safe and neox_args.bias_gelu_fusion: 30 | raise ValueError("onnx_safe + bias_gelu_fusion not compatible") 31 | if neox_args.onnx_safe: 32 | activation_func = erf_gelu 33 | elif neox_args.bias_gelu_fusion: 34 | activation_func = bias_gelu_impl 35 | else: 36 | activation_func = F.gelu 37 | elif neox_args.activation == "relu": 38 | activation_func = F.relu 39 | elif neox_args.activation == "softsign": 40 | activation_func = F.softsign 41 | elif neox_args.activation == "swish": 42 | activation_func = swish 43 | elif neox_args.activation == "mish": 44 | activation_func = mish 45 | else: 46 | raise ValueError(f"Activation function {neox_args.activation} not recognized") 47 | return activation_func 48 | 49 | 50 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 51 | # 1/sqrt(2*pi)-> 0.3989423 52 | # 1/sqrt(2) -> 0.70710678 53 | # sqrt(2/pi) -> 0.79788456 54 | # this function is tanh approximation of gelu 55 | # actual gelu is: 56 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 57 | 58 | 59 | @torch.jit.script 60 | def bias_gelu(bias, y): 61 | x = bias + y 62 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 63 | 64 | 65 | # gradient of tanh approximation of gelu 66 | # gradient of actual gelu is: 67 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 68 | @torch.jit.script 69 | def bias_gelu_back(g, bias, y): 70 | x = bias + y 71 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 72 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 73 | ff = 0.5 * x * ( 74 | (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x) 75 | ) + 0.5 * (1 + tanh_out) 76 | return ff * g 77 | 78 | 79 | class GeLUFunction(torch.autograd.Function): 80 | @staticmethod 81 | # bias is an optional argument 82 | def forward(ctx, input, bias): 83 | ctx.save_for_backward(input, bias) 84 | return bias_gelu(bias, input) 85 | 86 | @staticmethod 87 | def backward(ctx, grad_output): 88 | input, bias = ctx.saved_tensors 89 | tmp = bias_gelu_back(grad_output, bias, input) 90 | return tmp, tmp 91 | 92 | 93 | bias_gelu_impl = GeLUFunction.apply 94 | 95 | 96 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 97 | @torch.jit.script 98 | def erf_gelu(x): 99 | return ( 100 | x 101 | * 0.5 102 | * ( 103 | torch.erf(x / 1.41421).to(dtype=x.dtype) 104 | + torch.ones_like(x).to(dtype=x.dtype) 105 | ) 106 | ) 107 | 108 | 109 | @torch.jit.script 110 | def swish(x, beta: float = 1.0): 111 | return x * torch.sigmoid(beta * x) 112 | 113 | 114 | @torch.jit.script 115 | def mish(x): 116 | return x * torch.tanh(F.softplus(x)) 117 | 118 | 119 | class GEGLU(torch.nn.Module): 120 | def __init__(self, neox_args): 121 | super(GEGLU, self).__init__() 122 | if neox_args.onnx_safe: 123 | self.activation_func = erf_gelu 124 | else: 125 | self.activation_func = F.gelu 126 | 127 | def forward(self, x, bias=None): 128 | x, gate = x.chunk(2, dim=-1) 129 | if bias is not None: 130 | bias_1, bias_2 = bias.chunk(2, dim=-1) 131 | x = x + bias_1 132 | gate = gate + bias_2 133 | intermediate_parallel = self.activation_func(gate) 134 | return intermediate_parallel * x 135 | -------------------------------------------------------------------------------- /megatron/model/gmlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from megatron.model.fused_softmax import FusedScaleMaskSoftmax 6 | from megatron.model.activations import get_activation 7 | from megatron.model.norms import get_norm 8 | from megatron.model.utils import get_fusion_type 9 | 10 | from megatron import mpu 11 | 12 | 13 | class TinyAttention(nn.Module): 14 | def __init__(self, neox_args, d_attn, d_ff, mask_fn): 15 | super().__init__() 16 | self.proj_qkv = nn.Linear(d_ff * 2, 3 * d_attn) 17 | self.scale = d_attn ** -0.5 18 | self.proj_ffn = nn.Linear(d_attn, d_ff) 19 | self.softmax = FusedScaleMaskSoftmax( 20 | input_in_fp16=neox_args.precision == "fp16", 21 | input_in_bf16=neox_args.precision == "bfloat16", 22 | fusion_type=get_fusion_type(neox_args), 23 | mask_func=mask_fn, 24 | softmax_in_fp32=neox_args.attention_softmax_in_fp32, 25 | scale=None, 26 | ) 27 | 28 | def forward(self, x, attention_mask): 29 | q, k, v = torch.chunk(self.proj_qkv(x), 3, dim=-1) 30 | w = torch.einsum("bnd,bmd->bnm", q, k).unsqueeze(1) * self.scale 31 | a = self.softmax( 32 | w, mask=attention_mask[..., : w.size(-2), : w.size(-1)] 33 | ).squeeze(1) 34 | x = torch.einsum("bnm,bmd->bnd", a, v) 35 | return self.proj_ffn(x) 36 | 37 | 38 | class SpatialGatingUnit(nn.Module): 39 | def __init__(self, neox_args, d_ff, d_attn=None, causal=True, mask_fn=None): 40 | super().__init__() 41 | self.causal = causal 42 | 
self.use_attn = d_attn is not None 43 | 44 | norm, eps = get_norm(neox_args) 45 | self.norm = norm(d_ff, eps=eps) 46 | self.proj = nn.Linear(neox_args.seq_length, neox_args.seq_length) 47 | if self.use_attn: 48 | assert mask_fn is not None 49 | self.attn = TinyAttention( 50 | neox_args=neox_args, d_attn=d_attn, d_ff=d_ff, mask_fn=mask_fn 51 | ) 52 | nn.init.zeros_(self.proj.weight) 53 | nn.init.constant_(self.proj.bias, 1.0) 54 | 55 | def forward(self, x, attention_mask): 56 | device, n = x.device, x.shape[1] 57 | x = x.transpose(0, 1) # [s, b, d] -> [b, s, d] 58 | 59 | res, gate = x.chunk(2, dim=-1) # split along dim 60 | gate = self.norm(gate) 61 | 62 | weight, bias = self.proj.weight, self.proj.bias 63 | if self.causal: 64 | weight, bias = weight[:n, :n], bias[:n] 65 | mask = torch.ones(weight.shape[:2], device=device).triu_(1).bool() 66 | weight = weight.masked_fill(mask, 0.0) 67 | 68 | gate = F.linear(gate.transpose(2, 1), weight, self.proj.bias).transpose(2, 1) 69 | 70 | if self.use_attn: 71 | gate = gate + self.attn(x, attention_mask) 72 | 73 | return (gate * res).transpose(0, 1) # [b, s, d] -> [s, b, d] 74 | 75 | 76 | class GMLPBlock(nn.Module): 77 | def __init__( 78 | self, 79 | neox_args, 80 | init_method, 81 | output_layer_init_method, 82 | layer_number, 83 | ff_mult=4, 84 | mask_fn=None, 85 | ): 86 | super().__init__() 87 | self.layer_number = layer_number 88 | 89 | ff_dim = neox_args.hidden_size * ff_mult 90 | norm, eps = get_norm(neox_args) 91 | self.norm = norm(neox_args.hidden_size, eps=eps) 92 | self.input_linear = mpu.ColumnParallelLinear( 93 | neox_args=neox_args, 94 | input_size=neox_args.hidden_size, 95 | output_size=ff_dim * 2, 96 | gather_output=False, 97 | init_method=init_method, 98 | skip_bias_add=True, 99 | ) 100 | self.activation_func = get_activation(neox_args) 101 | ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) 102 | if neox_args.attention_config[layer_number] == "amlp": 103 | d_attn = neox_args.gmlp_attn_dim 104 | else: 105 | d_attn = None 106 | self.sgu = SpatialGatingUnit( 107 | neox_args, ff_dim_parallel, d_attn, causal=True, mask_fn=mask_fn 108 | ) 109 | self.output_linear = mpu.RowParallelLinear( 110 | neox_args=neox_args, 111 | input_size=ff_dim, 112 | output_size=neox_args.hidden_size, 113 | input_is_parallel=True, 114 | init_method=output_layer_init_method, 115 | skip_bias_add=True, 116 | ) 117 | 118 | def forward(self, args): 119 | assert len(args) == 2, "GMLPBlock expects 2 arguments" 120 | x, attention_mask = args 121 | x = self.norm(x) 122 | x, _ = self.input_linear(x) 123 | x = self.activation_func(x) 124 | x = self.sgu(x, attention_mask) 125 | x, _ = self.output_linear(x) 126 | return x, attention_mask 127 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_load.py: -------------------------------------------------------------------------------- 1 | """ 2 | load all confings in neox/configs in order to perform validations implemented in NeoXArgs 3 | """ 4 | import pytest 5 | import yaml 6 | from ..common import get_configs_with_path 7 | 8 | 9 | def run_neox_args_load_test(yaml_files): 10 | from megatron.neox_arguments import NeoXArgs 11 | 12 | yaml_list = get_configs_with_path(yaml_files) 13 | args_loaded = NeoXArgs.from_ymls(yaml_list) 14 | assert isinstance(args_loaded, NeoXArgs) 15 | 16 | # initialize an empty config dictionary to be filled by yamls 17 | config = dict() 18 | 19 | # iterate of all to be loaded yaml files 20 | for conf_file_name in 
yaml_list: 21 | 22 | # load file 23 | with open(conf_file_name) as conf_file: 24 | conf = yaml.load(conf_file, Loader=yaml.FullLoader) 25 | 26 | # check for key duplicates and load values 27 | for conf_key, conf_value in conf.items(): 28 | if conf_key in config: 29 | raise ValueError( 30 | f"Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}" 31 | ) 32 | 33 | conf_key_converted = conf_key.replace( 34 | "-", "_" 35 | ) # TODO remove replace and update configuration files? 36 | config[conf_key_converted] = conf_value 37 | 38 | # validate that neox args has the same value as specified in the config (if specified in the config) 39 | for k, v in config.items(): 40 | neox_args_value = getattr(args_loaded, k) 41 | assert v == neox_args_value, ( 42 | "loaded neox args value " 43 | + str(k) 44 | + " == " 45 | + str(neox_args_value) 46 | + " different from config file " 47 | + str(v) 48 | ) 49 | 50 | 51 | @pytest.mark.cpu 52 | def test_neoxargs_load_arguments_small_local_setup(): 53 | """ 54 | verify small.yml can be loaded without raising validation errors 55 | """ 56 | run_neox_args_load_test(["small.yml", "local_setup.yml"]) 57 | 58 | 59 | @pytest.mark.cpu 60 | def test_neoxargs_load_arguments_small_local_setup_text_generation(): 61 | """ 62 | verify small.yml can be loaded together with text generation without raising validation errors 63 | """ 64 | run_neox_args_load_test(["small.yml", "local_setup.yml", "text_generation.yml"]) 65 | 66 | 67 | @pytest.mark.cpu 68 | def test_neoxargs_load_arguments_medium_local_setup(): 69 | """ 70 | verify medium.yml can be loaded without raising validation errors 71 | """ 72 | run_neox_args_load_test(["medium.yml", "local_setup.yml"]) 73 | 74 | 75 | @pytest.mark.cpu 76 | def test_neoxargs_load_arguments_large_local_setup(): 77 | """ 78 | verify large.yml can be loaded without raising validation errors 79 | """ 80 | run_neox_args_load_test(["large.yml", "local_setup.yml"]) 81 | 82 | 83 | @pytest.mark.cpu 84 | def test_neoxargs_load_arguments_2_7B_local_setup(): 85 | """ 86 | verify 2-7B.yml can be loaded without raising validation errors 87 | """ 88 | run_neox_args_load_test(["2-7B.yml", "local_setup.yml"]) 89 | 90 | 91 | @pytest.mark.cpu 92 | def test_neoxargs_load_arguments_6_7B_local_setup(): 93 | """ 94 | verify 6-7B.yml can be loaded without raising validation errors 95 | """ 96 | run_neox_args_load_test(["6-7B.yml", "local_setup.yml"]) 97 | 98 | 99 | @pytest.mark.cpu 100 | def test_neoxargs_load_arguments_13B_local_setup(): 101 | """ 102 | verify 13B.yml can be loaded without raising validation errors 103 | """ 104 | run_neox_args_load_test(["13B.yml", "local_setup.yml"]) 105 | 106 | 107 | @pytest.mark.cpu 108 | def test_neoxargs_load_arguments_XL_local_setup(): 109 | """ 110 | verify XL.yml can be loaded without raising validation errors 111 | """ 112 | run_neox_args_load_test(["XL.yml", "local_setup.yml"]) 113 | 114 | 115 | @pytest.mark.cpu 116 | def test_neoxargs_load_arguments_175B_local_setup(): 117 | """ 118 | verify 13B.yml can be loaded without raising validation errors 119 | """ 120 | run_neox_args_load_test(["175B.yml", "local_setup.yml"]) 121 | 122 | 123 | @pytest.mark.cpu 124 | def test_neoxargs_fail_instantiate_without_required_params(): 125 | """ 126 | verify assertion error if required arguments are not provided 127 | """ 128 | 129 | try: 130 | run_neox_args_load_test(["local_setup.yml"]) 131 | assert False 132 | except Exception as e: 133 | assert True 134 | 135 | 136 | @pytest.mark.cpu 137 | 
def test_neoxargs_fail_instantiate_without_any_params(): 138 | """ 139 | verify assertion error if required arguments are not provided 140 | """ 141 | from megatron.neox_arguments import NeoXArgs 142 | 143 | try: 144 | args_loaded = NeoXArgs() 145 | assert False 146 | except Exception as e: 147 | assert True 148 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "scaled_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads) 32 | { 33 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 34 | } 35 | 36 | torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) 37 | { 38 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 39 | const int batches = input.size(0); 40 | const int pad_batches = mask.size(0); 41 | const int attn_heads = input.size(1); 42 | const int query_seq_len = input.size(2); 43 | const int key_seq_len = input.size(3); 44 | TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); 45 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 46 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 47 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 48 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 49 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 50 | 51 | // Output 52 | auto act_options = input.options().requires_grad(false); 53 | torch::Tensor softmax_results = 54 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 55 | 56 | // Softmax Intermediate Result Ptr 57 | void* input_ptr = static_cast(input.data_ptr()); 58 | void* mask_ptr = static_cast(mask.data_ptr()); 59 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 60 | 61 | DISPATCH_HALF_AND_BFLOAT(input.scalar_type(), 62 | "dispatch_scaled_masked_softmax_forward", 63 | dispatch_scaled_masked_softmax_forward( 64 | reinterpret_cast(softmax_results_ptr), 65 | reinterpret_cast(input_ptr), 66 | reinterpret_cast(mask_ptr), 67 | scale_factor, 68 | query_seq_len, 69 | key_seq_len, 70 | batches, 71 | attn_heads, 72 | pad_batches);); 73 | return softmax_results; 74 | } 75 | 76 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, 77 | torch::Tensor const& softmax_results_, 78 | float scale_factor) 79 | { 80 | auto output_grads = output_grads_.contiguous(); 81 | auto softmax_results = softmax_results_.contiguous(); 82 | 83 | // 
output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 84 | const int batches = output_grads.size(0); 85 | const int attn_heads = output_grads.size(1); 86 | const int query_seq_len = output_grads.size(2); 87 | const int key_seq_len = output_grads.size(3); 88 | 89 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 90 | 91 | // Softmax Grad 92 | DISPATCH_HALF_AND_BFLOAT(output_grads_.scalar_type(), 93 | "dispatch_scaled_masked_softmax_backward", 94 | dispatch_scaled_masked_softmax_backward( 95 | reinterpret_cast(output_grads_ptr), 96 | reinterpret_cast(output_grads_ptr), 97 | reinterpret_cast(softmax_results.data_ptr()), 98 | scale_factor, 99 | query_seq_len, 100 | key_seq_len, 101 | batches, 102 | attn_heads);); 103 | 104 | // backward pass is completely in-place 105 | return output_grads; 106 | } 107 | } // namespace scaled_masked_softmax 108 | } // namespace fused_softmax 109 | } // namespace multihead_attn 110 | -------------------------------------------------------------------------------- /megatron/mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_world_size 21 | from .utils import VocabUtility 22 | 23 | 24 | class _VocabParallelCrossEntropy(torch.autograd.Function): 25 | @staticmethod 26 | def forward(ctx, vocab_parallel_logits, target): 27 | 28 | # Maximum value along vocab dimension across all GPUs. 29 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 30 | torch.distributed.all_reduce( 31 | logits_max, 32 | op=torch.distributed.ReduceOp.MAX, 33 | group=get_model_parallel_group(), 34 | ) 35 | # Subtract the maximum value. 36 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 37 | 38 | # Get the partition's vocab indecies 39 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 40 | partition_vocab_size = vocab_parallel_logits.size()[-1] 41 | rank = get_model_parallel_rank() 42 | world_size = get_model_parallel_world_size() 43 | vocab_start_index, vocab_end_index = get_vocab_range( 44 | partition_vocab_size, rank, world_size 45 | ) 46 | 47 | # Create a mask of valid vocab ids (1 means it needs to be masked). 48 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 49 | masked_target = target.clone() - vocab_start_index 50 | masked_target[target_mask] = 0 51 | 52 | # Get predicted-logits = logits[target]. 53 | # For Simplicity, we convert logits to a 2-D tensor with size 54 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 
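        # Worked example (illustrative): with a model-parallel world size of 2 and
        # a global vocab of 8 split 4/4, rank 0 owns ids 0-3 and rank 1 owns ids 4-7.
        # A target id of 6 is masked on rank 0 (it contributes 0 to predicted_logits)
        # and maps to local index 2 on rank 1; the all_reduce below then sums the
        # partial results, so every rank recovers the true logit for id 6.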
55 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 56 | masked_target_1d = masked_target.view(-1) 57 | arange_1d = torch.arange( 58 | start=0, end=logits_2d.size()[0], device=logits_2d.device 59 | ) 60 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 61 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 62 | predicted_logits = predicted_logits_1d.view_as(target) 63 | predicted_logits[target_mask] = 0.0 64 | # All reduce is needed to get the chunks from other GPUs. 65 | torch.distributed.all_reduce( 66 | predicted_logits, 67 | op=torch.distributed.ReduceOp.SUM, 68 | group=get_model_parallel_group(), 69 | ) 70 | 71 | # Sum of exponential of logits along vocab dimension across all GPUs. 72 | exp_logits = vocab_parallel_logits 73 | torch.exp(vocab_parallel_logits, out=exp_logits) 74 | sum_exp_logits = exp_logits.sum(dim=-1) 75 | torch.distributed.all_reduce( 76 | sum_exp_logits, 77 | op=torch.distributed.ReduceOp.SUM, 78 | group=get_model_parallel_group(), 79 | ) 80 | 81 | # Loss = log(sum(exp(logits))) - predicted-logit. 82 | loss = torch.log(sum_exp_logits) - predicted_logits 83 | 84 | # Store softmax, target-mask and masked-target for backward pass. 85 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 86 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 87 | 88 | return loss 89 | 90 | @staticmethod 91 | def backward(ctx, grad_output): 92 | 93 | # Retreive tensors from the forward path. 94 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 95 | 96 | # All the inputs have softmax as thier gradient. 97 | grad_input = softmax 98 | # For simplicity, work with the 2D gradient. 99 | partition_vocab_size = softmax.size()[-1] 100 | grad_2d = grad_input.view(-1, partition_vocab_size) 101 | 102 | # Add the gradient from matching classes. 103 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) 104 | grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() 105 | 106 | # Finally elementwise multiplication with the output gradients. 
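        # (Together with the subtraction above, this implements the usual
        # d loss / d logits = (softmax - one_hot(target)) * grad_output, with the
        # one_hot contribution applied only on the partition that owns the target id.)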
107 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 108 | 109 | return grad_input, None 110 | 111 | 112 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 113 | """Helper function for the cross entropy.""" 114 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 115 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # Refer to the following link for the explanation of each params: 3 | # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html 4 | Language: Cpp 5 | # BasedOnStyle: Google 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AlignConsecutiveAssignments: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: true 12 | AlignTrailingComments: true 13 | AllowAllParametersOfDeclarationOnNextLine: false 14 | AllowShortBlocksOnASingleLine: true 15 | AllowShortCaseLabelsOnASingleLine: true 16 | AllowShortFunctionsOnASingleLine: All 17 | AllowShortIfStatementsOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: true 19 | # This is deprecated 20 | AlwaysBreakAfterDefinitionReturnType: None 21 | AlwaysBreakAfterReturnType: None 22 | AlwaysBreakBeforeMultilineStrings: true 23 | AlwaysBreakTemplateDeclarations: true 24 | BinPackArguments: false 25 | BinPackParameters: false 26 | BraceWrapping: 27 | AfterClass: false 28 | AfterControlStatement: false 29 | AfterEnum: false 30 | AfterFunction: false 31 | AfterNamespace: false 32 | AfterObjCDeclaration: false 33 | AfterStruct: false 34 | AfterUnion: false 35 | AfterExternBlock: false 36 | BeforeCatch: false 37 | BeforeElse: false 38 | IndentBraces: false 39 | # disabling the below splits, else, they'll just add to the vertical length of source files! 
40 | SplitEmptyFunction: false 41 | SplitEmptyRecord: false 42 | SplitEmptyNamespace: false 43 | BreakBeforeBinaryOperators: None 44 | BreakBeforeBraces: WebKit 45 | BreakBeforeInheritanceComma: false 46 | BreakInheritanceList: BeforeColon 47 | BreakBeforeTernaryOperators: true 48 | BreakConstructorInitializersBeforeComma: false 49 | BreakConstructorInitializers: BeforeColon 50 | BreakAfterJavaFieldAnnotations: false 51 | BreakStringLiterals: true 52 | ColumnLimit: 100 53 | CommentPragmas: '^ IWYU pragma:' 54 | CompactNamespaces: false 55 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 56 | # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform 57 | ConstructorInitializerIndentWidth: 4 58 | ContinuationIndentWidth: 4 59 | Cpp11BracedListStyle: true 60 | DerivePointerAlignment: false 61 | DisableFormat: false 62 | ExperimentalAutoDetectBinPacking: false 63 | FixNamespaceComments: true 64 | ForEachMacros: 65 | - foreach 66 | - Q_FOREACH 67 | - BOOST_FOREACH 68 | IncludeBlocks: Preserve 69 | IncludeCategories: 70 | - Regex: '^' 71 | Priority: 2 72 | - Regex: '^<.*\.h>' 73 | Priority: 1 74 | - Regex: '^<.*' 75 | Priority: 2 76 | - Regex: '.*' 77 | Priority: 3 78 | IncludeIsMainRegex: '([-_](test|unittest))?$' 79 | IndentCaseLabels: true 80 | IndentPPDirectives: None 81 | IndentWidth: 4 82 | IndentWrappedFunctionNames: false 83 | JavaScriptQuotes: Leave 84 | JavaScriptWrapImports: true 85 | KeepEmptyLinesAtTheStartOfBlocks: false 86 | MacroBlockBegin: '' 87 | MacroBlockEnd: '' 88 | MaxEmptyLinesToKeep: 1 89 | NamespaceIndentation: None 90 | ObjCBinPackProtocolList: Never 91 | ObjCBlockIndentWidth: 4 92 | ObjCSpaceAfterProperty: false 93 | ObjCSpaceBeforeProtocolList: true 94 | PenaltyBreakAssignment: 4 95 | PenaltyBreakBeforeFirstCallParameter: 1 96 | PenaltyBreakComment: 300 97 | PenaltyBreakFirstLessLess: 120 98 | PenaltyBreakString: 1000 99 | PenaltyBreakTemplateDeclaration: 10 100 | PenaltyExcessCharacter: 1000000 101 | PenaltyReturnTypeOnItsOwnLine: 200 102 | PointerAlignment: Left 103 | RawStringFormats: 104 | - Language: Cpp 105 | Delimiters: 106 | - cc 107 | - CC 108 | - cpp 109 | - Cpp 110 | - CPP 111 | - 'c++' 112 | - 'C++' 113 | CanonicalDelimiter: '' 114 | - Language: TextProto 115 | Delimiters: 116 | - pb 117 | - PB 118 | - proto 119 | - PROTO 120 | EnclosingFunctions: 121 | - EqualsProto 122 | - EquivToProto 123 | - PARSE_PARTIAL_TEXT_PROTO 124 | - PARSE_TEST_PROTO 125 | - PARSE_TEXT_PROTO 126 | - ParseTextOrDie 127 | - ParseTextProtoOrDie 128 | CanonicalDelimiter: '' 129 | BasedOnStyle: google 130 | # Enabling comment reflow causes doxygen comments to be messed up in their formats! 131 | ReflowComments: true 132 | SortIncludes: true 133 | SortUsingDeclarations: true 134 | SpaceAfterCStyleCast: false 135 | SpaceAfterTemplateKeyword: true 136 | SpaceBeforeAssignmentOperators: true 137 | SpaceBeforeCpp11BracedList: false 138 | SpaceBeforeCtorInitializerColon: true 139 | SpaceBeforeInheritanceColon: true 140 | SpaceBeforeParens: ControlStatements 141 | SpaceBeforeRangeBasedForLoopColon: true 142 | SpaceInEmptyParentheses: false 143 | SpacesBeforeTrailingComments: 2 144 | SpacesInAngles: false 145 | SpacesInContainerLiterals: true 146 | SpacesInCStyleCastParentheses: false 147 | SpacesInParentheses: false 148 | SpacesInSquareBrackets: false 149 | Standard: Cpp11 150 | StatementMacros: 151 | - Q_UNUSED 152 | - QT_REQUIRE_VERSION 153 | # Be consistent with indent-width, even for people who use tab for indentation! 
154 | TabWidth: 4 155 | UseTab: Never 156 | -------------------------------------------------------------------------------- /megatron/learning_rates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Learning rate decay functions.""" 16 | 17 | import math 18 | 19 | from megatron import print_rank_0 20 | 21 | 22 | class AnnealingLR(object): 23 | """Anneals the learning rate.""" 24 | 25 | def __init__( 26 | self, 27 | optimizer, 28 | start_lr, 29 | warmup_iter, 30 | total_iters, 31 | decay_style, 32 | last_iter, 33 | min_lr=0.0, 34 | use_checkpoint_lr_scheduler=True, 35 | override_lr_scheduler=False, 36 | ): 37 | 38 | # Class values. 39 | self.optimizer = optimizer 40 | self.start_lr = start_lr 41 | self.min_lr = min_lr 42 | self.warmup_iter = warmup_iter 43 | self.num_iters = last_iter 44 | self.end_iter = total_iters 45 | assert self.end_iter > 0 46 | self.decay_style = decay_style 47 | self.override_lr_scheduler = override_lr_scheduler 48 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler 49 | if self.override_lr_scheduler: 50 | assert not self.use_checkpoint_lr_scheduler, ( 51 | "both override and " "use-checkpoint are set." 52 | ) 53 | # Set the learning rate 54 | self.step(self.num_iters) 55 | 56 | print_rank_0("> learning rate decay style: {}".format(self.decay_style)) 57 | 58 | def get_lr(self): 59 | """Learning rate decay functions from: 60 | https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" 61 | 62 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) 63 | # Warmup. 
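As a reading aid (editor's sketch, not part of the module), the warmup-plus-cosine schedule implemented by the branches of get_lr() below can be written as a pure function of the step count:

import math

def annealed_lr(step, start_lr, min_lr, warmup_iter, end_iter):
    # mirrors AnnealingLR.get_lr() with decay_style == "cosine"
    num_iters = min(step, end_iter - warmup_iter)
    if warmup_iter > 0 and step <= warmup_iter:
        return float(start_lr) * num_iters / warmup_iter     # linear warmup
    num_iters -= warmup_iter
    lr = start_lr / 2.0 * (math.cos(math.pi * num_iters / end_iter) + 1)
    return max(lr, min_lr)

# e.g. annealed_lr(0, 6e-4, 6e-5, 100, 1000) == 0.0, and the lr peaks at step 100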
64 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 65 | return float(self.start_lr) * num_iters_ / self.warmup_iter 66 | 67 | num_iters_ = num_iters_ - self.warmup_iter 68 | if self.decay_style == "linear": 69 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter 70 | elif self.decay_style == "cosine": 71 | lr = ( 72 | self.start_lr 73 | / 2.0 74 | * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) 75 | ) 76 | elif self.decay_style == "exponential": 77 | # exp(-0.693) = 1/2 78 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) 79 | else: 80 | lr = self.start_lr 81 | return max(lr, self.min_lr) 82 | 83 | def step(self, step_num=None): 84 | """Set lr for all parameters groups.""" 85 | if step_num is None: 86 | step_num = self.num_iters + 1 87 | self.num_iters = step_num 88 | new_lr = self.get_lr() 89 | for group in self.optimizer.param_groups: 90 | group["lr"] = new_lr 91 | 92 | def state_dict(self): 93 | state_dict = { 94 | "start_lr": self.start_lr, 95 | "warmup_iter": self.warmup_iter, 96 | "num_iters": self.num_iters, 97 | "decay_style": self.decay_style, 98 | "end_iter": self.end_iter, 99 | "min_lr": self.min_lr, 100 | } 101 | return state_dict 102 | 103 | def _check_and_set(self, cls_value, sd_value, name): 104 | """Auxiliary function for checking the values in the checkpoint and 105 | setting them.""" 106 | if self.override_lr_scheduler: 107 | print_rank_0(" > overriding {} value to {}".format(name, cls_value)) 108 | return cls_value 109 | 110 | if not self.use_checkpoint_lr_scheduler: 111 | assert cls_value == sd_value, ( 112 | "AnnealingLR: class input value" 113 | "and checkpoint values for {} do not match".format(name) 114 | ) 115 | print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name)) 116 | return sd_value 117 | 118 | def load_state_dict(self, sd): 119 | 120 | self.start_lr = self._check_and_set( 121 | self.start_lr, sd["start_lr"], "learning rate" 122 | ) 123 | self.min_lr = self._check_and_set( 124 | self.min_lr, sd["min_lr"], "minimum learning rate" 125 | ) 126 | self.warmup_iter = self._check_and_set( 127 | self.warmup_iter, sd["warmup_iter"], "warmup iterations" 128 | ) 129 | self.end_iter = self._check_and_set( 130 | self.end_iter, sd["end_iter"], "total number of iterations" 131 | ) 132 | self.decay_style = self._check_and_set( 133 | self.decay_style, sd["decay_style"], "decay style" 134 | ) 135 | 136 | self.num_iters = sd["num_iters"] 137 | self.step(self.num_iters) 138 | -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "compat.h" 19 | 20 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) 
\ 21 | switch (TYPE) { \ 22 | case at::ScalarType::Half: { \ 23 | using scalar_t = at::Half; \ 24 | __VA_ARGS__; \ 25 | break; \ 26 | } \ 27 | case at::ScalarType::BFloat16: { \ 28 | using scalar_t = at::BFloat16; \ 29 | __VA_ARGS__; \ 30 | break; \ 31 | } \ 32 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 33 | } 34 | 35 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 36 | switch (TYPEIN) { \ 37 | case at::ScalarType::Float: { \ 38 | using scalar_t_in = float; \ 39 | switch (TYPEOUT) { \ 40 | case at::ScalarType::Float: { \ 41 | using scalar_t_out = float; \ 42 | __VA_ARGS__; \ 43 | break; \ 44 | } \ 45 | case at::ScalarType::Half: { \ 46 | using scalar_t_out = at::Half; \ 47 | __VA_ARGS__; \ 48 | break; \ 49 | } \ 50 | case at::ScalarType::BFloat16: { \ 51 | using scalar_t_out = at::BFloat16; \ 52 | __VA_ARGS__; \ 53 | break; \ 54 | } \ 55 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 56 | } \ 57 | break; \ 58 | } \ 59 | case at::ScalarType::Half: { \ 60 | using scalar_t_in = at::Half; \ 61 | using scalar_t_out = at::Half; \ 62 | __VA_ARGS__; \ 63 | break; \ 64 | } \ 65 | case at::ScalarType::BFloat16: { \ 66 | using scalar_t_in = at::BFloat16; \ 67 | using scalar_t_out = at::BFloat16; \ 68 | __VA_ARGS__; \ 69 | break; \ 70 | } \ 71 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 72 | } 73 | -------------------------------------------------------------------------------- /megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from .initialize import ( 18 | get_model_parallel_group, 19 | get_model_parallel_world_size, 20 | get_model_parallel_rank, 21 | get_fp32_allreduce, 22 | ) 23 | from .utils import split_tensor_along_last_dim 24 | 25 | 26 | def _reduce(input_): 27 | """All-reduce the the input tensor across model parallel group.""" 28 | 29 | # Bypass the function if we are using only 1 GPU. 30 | if get_model_parallel_world_size() == 1: 31 | return input_ 32 | 33 | # Bf16 convert 34 | dt = input_.dtype 35 | if dt == torch.bfloat16 and get_fp32_allreduce(): 36 | input_ = input_.float() 37 | 38 | # All-reduce. 39 | torch.distributed.all_reduce(input_, group=get_model_parallel_group()) 40 | 41 | # Bf16 convert 42 | if dt == torch.bfloat16 and get_fp32_allreduce(): 43 | input_ = input_.bfloat16() 44 | 45 | return input_ 46 | 47 | 48 | def _split(input_): 49 | """Split the tensor along its last dimension and keep the 50 | corresponding slice.""" 51 | 52 | world_size = get_model_parallel_world_size() 53 | # Bypass the function if we are using only 1 GPU. 
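The bf16-to-fp32 round-trip used in _reduce() above (and repeated in _split and _gather below) follows this pattern; presumably the point is to run the collective itself in 32-bit precision when fp32_allreduce is enabled. A minimal standalone sketch (editor's illustration, no process group involved):

import torch

x = torch.randn(4, dtype=torch.bfloat16)
y = x.float()                              # upcast before the collective
# torch.distributed.all_reduce(y, ...)     # the collective would run here
y = y.bfloat16()                           # cast back to the working dtype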
54 | if world_size == 1: 55 | return input_ 56 | 57 | # Bf16 convert 58 | dt = input_.dtype 59 | if dt == torch.bfloat16 and get_fp32_allreduce(): 60 | input_ = input_.float() 61 | 62 | # Split along last dimension. 63 | input_list = split_tensor_along_last_dim(input_, world_size) 64 | 65 | # Note: torch.split does not create contiguous tensors by default. 66 | rank = get_model_parallel_rank() 67 | output = input_list[rank].contiguous() 68 | 69 | # Bf16 convert 70 | if dt == torch.bfloat16 and get_fp32_allreduce(): 71 | output = output.bfloat16() 72 | 73 | return output 74 | 75 | 76 | def _gather(input_): 77 | """Gather tensors and concatinate along the last dimension.""" 78 | 79 | world_size = get_model_parallel_world_size() 80 | # Bypass the function if we are using only 1 GPU. 81 | if world_size == 1: 82 | return input_ 83 | 84 | # Bf16 convert 85 | dt = input_.dtype 86 | if dt == torch.bfloat16 and get_fp32_allreduce(): 87 | input_ = input_.float() 88 | 89 | # Size and dimension. 90 | last_dim = input_.dim() - 1 91 | rank = get_model_parallel_rank() 92 | 93 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 94 | tensor_list[rank] = input_ 95 | torch.distributed.all_gather(tensor_list, input_, group=get_model_parallel_group()) 96 | 97 | # Note: torch.cat already creates a contiguous tensor. 98 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 99 | 100 | # Bf16 convert 101 | if dt == torch.bfloat16 and get_fp32_allreduce(): 102 | output = output.bfloat16() 103 | 104 | return output 105 | 106 | 107 | class _CopyToModelParallelRegion(torch.autograd.Function): 108 | """Pass the input to the model parallel region.""" 109 | 110 | @staticmethod 111 | def symbolic(graph, input_): 112 | return input_ 113 | 114 | @staticmethod 115 | def forward(ctx, input_): 116 | return input_ 117 | 118 | @staticmethod 119 | def backward(ctx, grad_output): 120 | return _reduce(grad_output) 121 | 122 | 123 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 124 | """All-reduce the input from the model parallel region.""" 125 | 126 | @staticmethod 127 | def symbolic(graph, input_): 128 | return _reduce(input_) 129 | 130 | @staticmethod 131 | def forward(ctx, input_): 132 | return _reduce(input_) 133 | 134 | @staticmethod 135 | def backward(ctx, grad_output): 136 | return grad_output 137 | 138 | 139 | class _ScatterToModelParallelRegion(torch.autograd.Function): 140 | """Split the input and keep only the corresponding chuck to the rank.""" 141 | 142 | @staticmethod 143 | def symbolic(graph, input_): 144 | return _split(input_) 145 | 146 | @staticmethod 147 | def forward(ctx, input_): 148 | return _split(input_) 149 | 150 | @staticmethod 151 | def backward(ctx, grad_output): 152 | return _gather(grad_output) 153 | 154 | 155 | class _GatherFromModelParallelRegion(torch.autograd.Function): 156 | """Gather the input from model parallel region and concatinate.""" 157 | 158 | @staticmethod 159 | def symbolic(graph, input_): 160 | return _gather(input_) 161 | 162 | @staticmethod 163 | def forward(ctx, input_): 164 | return _gather(input_) 165 | 166 | @staticmethod 167 | def backward(ctx, grad_output): 168 | return _split(grad_output) 169 | 170 | 171 | # ----------------- 172 | # Helper functions. 
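Taken together, the four autograd functions above pair each forward collective with its conjugate in backward (identity/all-reduce and split/all-gather). A single-process emulation of the tensor manipulation (editor's sketch with a hypothetical model-parallel size of 2, no torch.distributed involved):

import torch

world_size = 2
x = torch.arange(8.0).view(2, 4)              # [batch, hidden]

# _split keeps one contiguous slice of the last dimension per rank
shards = torch.chunk(x, world_size, dim=-1)   # rank r keeps shards[r]

# _gather concatenates the per-rank slices back along the last dimension
assert torch.equal(torch.cat(shards, dim=-1), x)

# _reduce sums the partial results held by each rank (all-reduce)
partials = [x / world_size for _ in range(world_size)]
assert torch.allclose(torch.stack(partials).sum(dim=0), x)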
173 | # ----------------- 174 | 175 | 176 | def copy_to_model_parallel_region(input_): 177 | return _CopyToModelParallelRegion.apply(input_) 178 | 179 | 180 | def reduce_from_model_parallel_region(input_): 181 | return _ReduceFromModelParallelRegion.apply(input_) 182 | 183 | 184 | def scatter_to_model_parallel_region(input_): 185 | return _ScatterToModelParallelRegion.apply(input_) 186 | 187 | 188 | def gather_from_model_parallel_region(input_): 189 | return _GatherFromModelParallelRegion.apply(input_) 190 | -------------------------------------------------------------------------------- /tests/model/test_model_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | instantiate models, save checkpoints, load checkpoints, compare loaded parameters to saved parameters and compare forward pass outputs 3 | 4 | This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs 5 | to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures. 6 | """ 7 | import pytest 8 | 9 | from ..common import distributed_test, clear_test_dirs, model_setup, binary, parametrize 10 | 11 | import torch 12 | import os 13 | 14 | PARAMS_TO_TEST = { 15 | "norm,pos_emb,activation": [ 16 | ["layernorm", "learned", "gelu"], 17 | ["rmsnorm", "rotary", "relu"], 18 | ["scalenorm", "sinusoidal", "mish"], 19 | ["layernorm", "rpe", "geglu"], 20 | ["rmsnorm", "none", "swish"], 21 | ], 22 | "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]], 23 | "no_weight_tying": binary, 24 | "attention_config,num_layers": [ 25 | [[[["global"], "all"]], 2], 26 | [[[["local", "global"], "all"]], 12], 27 | [[[["sparse_variable", "global"], "all"]], 12], 28 | [[[["sparse_fixed", "global"], "all"]], 12], 29 | ], # the sparse attention models need more layers to be stable 30 | "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [ 31 | [True, False], 32 | [False, True], 33 | ], 34 | "checkpoint_activations": binary, 35 | "log_gradient_noise_scale": [True], 36 | "sparsity_config": [ 37 | { 38 | "block": 16, # block size 39 | "num_local_blocks": 32, 40 | } 41 | ], 42 | } 43 | 44 | 45 | parameters, names = parametrize( 46 | PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 47 | ) 48 | 49 | 50 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 51 | def test_train(param_dict): 52 | @distributed_test(world_size=2) 53 | def wrapper(): 54 | run_train_test(param_dict=param_dict) 55 | 56 | wrapper() 57 | 58 | 59 | BF16_PARAMS_TO_TEST = { 60 | "fp16,fp32_allreduce": [ 61 | [ 62 | { 63 | "enabled": True, 64 | "type": "bfloat16", 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1, 69 | }, 70 | True, 71 | ], 72 | [ 73 | { 74 | "enabled": True, 75 | "loss_scale": 0, 76 | "loss_scale_window": 1000, 77 | "hysteresis": 2, 78 | "min_loss_scale": 1, 79 | }, 80 | False, 81 | ], 82 | ] 83 | } 84 | 85 | parameters, names = parametrize( 86 | BF16_PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 87 | ) 88 | 89 | 90 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 91 | def test_train_bf16(param_dict): 92 | @distributed_test(world_size=2) 93 | def wrapper(): 94 | run_train_test(param_dict=param_dict) 95 | 96 | wrapper() 97 | 98 | 99 | OPTIMIZER_PARAMS = { 100 | "optimizer": [ 101 | {"type": 
"adam", "params": {"lr": 0.0006}}, 102 | {"type": "onebitadam", "params": {"lr": 0.0006}}, 103 | {"type": "cpu_adam", "params": {"lr": 0.0006}}, 104 | {"type": "cpu_torch_adam", "params": {"lr": 0.0006}}, 105 | {"type": "sm3", "params": {"lr": 0.0006}}, 106 | {"type": "madgrad_wd", "params": {"lr": 0.0006}}, 107 | ] 108 | } 109 | opt_params, opt_name = parametrize( 110 | OPTIMIZER_PARAMS, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None 111 | ) 112 | 113 | 114 | @pytest.mark.parametrize("param_dict", parameters, ids=names) 115 | def test_train_optimizers(param_dict): 116 | @distributed_test(world_size=2) 117 | def wrapper(): 118 | run_train_test(param_dict=param_dict) 119 | 120 | wrapper() 121 | 122 | 123 | def run_train_test(yaml_list=None, param_dict=None): 124 | from megatron.training import train_step 125 | from megatron.utils import Timers 126 | 127 | max_steps = 64 128 | 129 | model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict) 130 | 131 | model.train() 132 | 133 | timers = Timers(use_wandb=False, tensorboard_writer=None) 134 | 135 | # generate some random data on which we can overfit 136 | # context size of data is model seq_len + 1 in order to compute loss 137 | data_list = list() 138 | context_tokens_tensor = torch.randint( 139 | 0, args_loaded.padded_vocab_size, (4, args_loaded.seq_length + 1) 140 | ).to(torch.int64) 141 | for i in range(max_steps): 142 | data_list.append({"text": context_tokens_tensor.clone()}) 143 | data_iterator = iter(data_list) 144 | 145 | # run train_step until the loss decreases 146 | losses = list() 147 | for i in range(max_steps): 148 | loss_dict, skipped_iter = train_step( 149 | neox_args=args_loaded, 150 | timers=timers, 151 | data_iterator=data_iterator, 152 | model=model, 153 | optimizer=optimizer, 154 | lr_scheduler=lr_scheduler, 155 | ) 156 | losses.append(loss_dict["lm_loss"]) 157 | if len(losses) >= 2: 158 | if torch.isnan(losses[-1]): 159 | continue 160 | if torch.isnan(losses[-2]): 161 | continue 162 | if losses[-1] < losses[-2]: 163 | return # all good 164 | 165 | # loss should have decreased by now (otherwise increasing the max_steps parameter could have the testcase pass) 166 | assert losses[-1] < losses[-2], ( 167 | "run_train_test() loss going down within " + str(max_steps) + " steps" 168 | ) 169 | 170 | if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0: 171 | clear_test_dirs() 172 | -------------------------------------------------------------------------------- /megatron/model/positional_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | 5 | class SinusoidalPositionalEmbedding(torch.nn.Module): 6 | def __init__(self, dim, base=10000, precision=torch.half): 7 | super().__init__() 8 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) 9 | self.register_buffer("inv_freq", inv_freq) 10 | self.precision = precision 11 | 12 | def forward(self, x, seq_dim=1): 13 | t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) 14 | sinusoid_inp = torch.einsum("i,j->ij", t, self.inv_freq) 15 | if self.precision == torch.bfloat16: 16 | sinusoid_inp = sinusoid_inp.float() 17 | sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos() 18 | if self.precision == torch.bfloat16: 19 | sin, cos = sin.bfloat16(), cos.bfloat16() 20 | emb = torch.cat((sin, cos), dim=-1) 21 | return emb[None, :, :] 22 | 23 | 24 | class RotaryEmbedding(torch.nn.Module): 25 | def __init__(self, dim, 
base=10000, precision=torch.half): 26 | super().__init__() 27 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) 28 | self.register_buffer("inv_freq", inv_freq) 29 | self.seq_len_cached = None 30 | self.cos_cached = None 31 | self.sin_cached = None 32 | self.precision = precision 33 | 34 | def forward(self, x, seq_dim=1, seq_len=None): 35 | if seq_len is None: 36 | seq_len = x.shape[seq_dim] 37 | if seq_len != self.seq_len_cached: 38 | self.seq_len_cached = seq_len 39 | t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq) 40 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 41 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 42 | if self.precision == torch.bfloat16: 43 | emb = emb.float() 44 | self.cos_cached = emb.cos()[:, None, None, :] 45 | self.sin_cached = emb.sin()[:, None, None, :] 46 | if self.precision == torch.bfloat16: 47 | self.cos_cached = self.cos_cached.bfloat16() 48 | self.sin_cached = self.sin_cached.bfloat16() 49 | return self.cos_cached, self.sin_cached 50 | 51 | 52 | # rotary pos emb helpers: 53 | 54 | 55 | def rotate_half(x): 56 | x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] 57 | return torch.cat( 58 | (-x2, x1), dim=x1.ndim - 1 59 | ) # dim=-1 triggers a bug in earlier torch versions 60 | 61 | 62 | @torch.jit.script 63 | def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): 64 | cos, sin = ( 65 | cos[offset : q.shape[0] + offset, ...], 66 | sin[offset : q.shape[0] + offset, ...], 67 | ) 68 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) 69 | 70 | 71 | def apply_rotary_pos_emb_torch( 72 | q, k, cos, sin, offset: int = 0 73 | ): # jitting fails with bf16 74 | cos, sin = ( 75 | cos[offset : q.shape[0] + offset, ...], 76 | sin[offset : q.shape[0] + offset, ...], 77 | ) 78 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) 79 | 80 | 81 | class AliBi(torch.nn.Module): 82 | def __init__(self, num_heads, mp_size=1, mp_rank=1): 83 | super().__init__() 84 | # megatron splits across heads, so we need to make sure each 85 | # head receives the correct matrix 86 | assert mp_size <= num_heads and mp_rank <= mp_size 87 | self.mp_size = mp_size 88 | self.mp_rank = mp_rank 89 | self.num_heads = num_heads 90 | self.slice_size = num_heads // mp_size 91 | self.cached_matrix = None 92 | self.cached_seq_len = None 93 | slopes = torch.Tensor(self._get_slopes(num_heads))[ 94 | mp_rank * self.slice_size : (mp_rank + 1) * self.slice_size 95 | ] 96 | self.register_buffer("slopes", slopes) 97 | 98 | def _get_slopes(self, n): 99 | """ 100 | Get slopes for Alibi positional embedding 101 | n : int = number of heads. 102 | For best performance, restrict n to a power of 2. 
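A usage illustration for the rotary helpers above (editor's sketch; the [seq, batch, heads, head_dim] layout is assumed from the [:, None, None, :] broadcasting of the cached cos/sin, and the non-jitted float path is used):

import torch

seq_len, batch, heads, head_dim = 8, 2, 4, 16
q = torch.randn(seq_len, batch, heads, head_dim)
k = torch.randn(seq_len, batch, heads, head_dim)

rotary = RotaryEmbedding(head_dim, precision=torch.float)    # class defined above
cos, sin = rotary(k, seq_dim=0, seq_len=seq_len)
q_rot, k_rot = apply_rotary_pos_emb_torch(q, k, cos, sin)

assert q_rot.shape == q.shape and k_rot.shape == k.shape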
103 | """ 104 | 105 | def get_slopes_power_of_2(n): 106 | start = 2 ** (-(2 ** -(math.log2(n) - 3))) 107 | ratio = start 108 | return [start * ratio ** i for i in range(n)] 109 | 110 | if math.log2(n).is_integer(): 111 | return get_slopes_power_of_2(n) 112 | else: 113 | closest_power_of_2 = 2 ** math.floor(math.log2(n)) 114 | return ( 115 | get_slopes_power_of_2(closest_power_of_2) 116 | + self._get_slopes(2 * closest_power_of_2)[0::2][ 117 | : n - closest_power_of_2 118 | ] 119 | ) 120 | 121 | def forward(self, x): 122 | # [b, np, sq, sk] 123 | seq_len_q = x.shape[-2] 124 | seq_len_k = x.shape[-1] 125 | if self.cached_seq_len != seq_len_k: 126 | a = -torch.tril( 127 | torch.arange(seq_len_k).view(seq_len_k, 1).repeat(1, seq_len_k) 128 | + torch.arange(0, -seq_len_k, -1) 129 | ) 130 | a = a.to(x.device).to(x.dtype) 131 | slopes = self.slopes.to(a.device).to(a.dtype) 132 | a = a * slopes.view(self.slopes.shape[0], 1, 1) 133 | self.cached_seq_len = seq_len_k 134 | self.cached_matrix = a 135 | else: 136 | a = self.cached_matrix 137 | 138 | if seq_len_q != seq_len_k: 139 | # In the train case x has dimensionality [b, np, sq, sk] with sq == sk 140 | # The number of query tokens is equal to the number of key tokens 141 | # At inference time with cache in layer_past sq is not equal to sk. sq only contains one token (the last one in the full sequence) 142 | # In this case we use the appropriate token index of the cache matrix. 143 | # As the cache matrix could already be bigger from a past inference, not the last token index in the sq sequence is used 144 | assert ( 145 | seq_len_q == 1 146 | ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1" 147 | a = a[:, seq_len_k - 1, :].view( 148 | a.shape[0], 1, a.shape[2] 149 | ) # seq_len_k - 1 points to the last token index in the current inference batch. 150 | 151 | return x + a 152 | -------------------------------------------------------------------------------- /megatron/data/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Batch samplers that work with either random or sequential data samplers.""" 16 | 17 | import torch 18 | from torch.utils import data 19 | 20 | 21 | class RandomSampler(data.sampler.Sampler): 22 | """Based off of pytorch RandomSampler and DistributedSampler. Essentially 23 | a RandomSampler, but this class lets the user set an epoch like 24 | DistributedSampler Samples elements randomly. If without replacement, then 25 | sample from a shuffled dataset. If with replacement, then user can 26 | specify ``num_samples`` to draw. 
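A worked example of the slope formula in AliBi._get_slopes() above (editor's sketch): for a power-of-two head count the slopes are the geometric sequence starting at 2 ** (-8 / n), with that same value as the ratio.

import math

n = 8                                           # num_heads
start = 2 ** (-(2 ** -(math.log2(n) - 3)))      # = 2 ** -1 = 0.5
slopes = [start * start ** i for i in range(n)]
assert slopes == [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]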
27 | Arguments: 28 | data_source (Dataset): dataset to sample from 29 | num_samples (int): number of samples to draw, default=len(dataset) 30 | replacement (bool): samples are drawn with replacement if ``True``, 31 | default=False 32 | """ 33 | 34 | def __init__(self, data_source, replacement=False, num_samples=None): 35 | self.data_source = data_source 36 | self.replacement = replacement 37 | self._num_samples = num_samples 38 | self.epoch = -1 39 | 40 | if self._num_samples is not None and replacement is False: 41 | raise ValueError( 42 | "With replacement=False, num_samples should not " 43 | "be specified, since a random permute will be " 44 | "performed." 45 | ) 46 | 47 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 48 | raise ValueError( 49 | "num_samples should be a positive integer " 50 | "value, but got num_samples={}".format(self.num_samples) 51 | ) 52 | if not isinstance(self.replacement, bool): 53 | raise ValueError( 54 | "replacement should be a boolean value, but got " 55 | "replacement={}".format(self.replacement) 56 | ) 57 | 58 | @property 59 | def num_samples(self): 60 | # dataset size might change at runtime 61 | if self._num_samples is None: 62 | return len(self.data_source) 63 | return self._num_samples 64 | 65 | def __iter__(self): 66 | n = len(self.data_source) 67 | g = torch.Generator() 68 | if self.epoch >= 0: 69 | g.manual_seed(self.epoch) 70 | if self.replacement: 71 | return iter( 72 | torch.randint( 73 | high=n, size=(self.num_samples,), dtype=torch.int64, generator=g 74 | ).tolist() 75 | ) 76 | return iter(torch.randperm(n, generator=g).tolist()) 77 | 78 | def __len__(self): 79 | return self.num_samples 80 | 81 | def set_epoch(self, epoch): 82 | self.epoch = epoch 83 | 84 | 85 | class DistributedBatchSampler(data.sampler.BatchSampler): 86 | """Similar to normal implementation of distributed sampler, except 87 | implementation is at the batch sampler level, instead of just the 88 | sampler level. This allows wrapping of arbitrary data samplers 89 | (sequential, random, WeightedRandomSampler, etc.) with this batch 90 | sampler. 91 | 92 | The `interleave` argument specifies how to distribute a batch. A value 93 | of True combined with the above random sampler is equivalent to pytorch's 94 | torch.utils.data.distributed.DistributedSampler. 
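The epoch-seeded generator above makes the shuffle reproducible: calling set_epoch() with the same value yields the same permutation on every data-parallel worker. A quick sketch (editor's illustration, using the RandomSampler class defined above on a toy list):

sampler = RandomSampler(list(range(10)))
sampler.set_epoch(3)
order_a = list(iter(sampler))
sampler.set_epoch(3)
order_b = list(iter(sampler))
assert order_a == order_b     # same epoch -> same permutation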
95 | 96 | For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2 97 | specifying True will result in the following samples for each gpu: 98 | GPU0: [0,2,4,6] GPU1: [1,3,5,7] 99 | specifying False will result in the following samples: 100 | GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" 101 | 102 | def __init__( 103 | self, 104 | sampler, 105 | batch_size, 106 | drop_last, 107 | rank=-1, 108 | world_size=2, 109 | wrap_last=False, 110 | interleave=False, 111 | ): 112 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 113 | if rank == -1: 114 | assert False, "should not be here" 115 | rank = torch.distributed.get_rank() 116 | self.rank = rank 117 | self.world_size = world_size 118 | self.sampler.wrap_around = 0 119 | self.wrap_around = 0 120 | self.wrap_last = wrap_last 121 | self.start_iter = 0 122 | self.interleave = interleave 123 | 124 | def __iter__(self): 125 | batch = [] 126 | i = 0 127 | for idx in self.data_iterator(self.sampler, wrap_around=False): 128 | batch.append(idx) 129 | if len(batch) == self.batch_size: 130 | tbatch = self._batch(batch) 131 | if i >= self.start_iter: 132 | yield tbatch 133 | self.start_iter = 0 134 | i += 1 135 | batch = [] 136 | batch_len = len(batch) 137 | if batch_len > 0 and not self.drop_last: 138 | if self.wrap_last: 139 | self.sampler.wrap_around -= self.batch_size 140 | self.wrap_around += len(batch) 141 | self.wrap_around %= self.batch_size 142 | yield self._batch(batch) 143 | if self.wrap_last: 144 | self.sampler.wrap_around += self.batch_size 145 | 146 | def data_iterator(self, _iter, wrap_around=False): 147 | """iterates through data and handles wrap around""" 148 | for i, idx in enumerate(_iter): 149 | if i < self.wrap_around % self.batch_size: 150 | continue 151 | if wrap_around: 152 | self.wrap_around += 1 153 | self.wrap_around %= self.batch_size 154 | yield idx 155 | 156 | def _batch(self, batch): 157 | """extracts samples only pertaining to this worker's batch""" 158 | if self.interleave: 159 | return batch[self.rank : self.batch_size : self.world_size] 160 | start = self.rank * self.batch_size // self.world_size 161 | end = (self.rank + 1) * self.batch_size // self.world_size 162 | return batch[start:end] 163 | -------------------------------------------------------------------------------- /megatron/neox_arguments/deepspeed_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | try: 4 | from .template import NeoXArgsTemplate 5 | except ImportError: 6 | from template import NeoXArgsTemplate 7 | 8 | 9 | @dataclass 10 | class NeoXArgsDeepspeedConfig(NeoXArgsTemplate): 11 | """ 12 | Args for deepspeed config 13 | Every argument included here will be included in deepspeed config json 14 | #TODO this list is not complete as compared to https://www.deepspeed.ai/docs/config-json/ 15 | """ 16 | 17 | deepspeed: bool = True 18 | """boolean flag to enable DeepSpeed (Always True)""" 19 | 20 | train_batch_size: int = None 21 | """ 22 | The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. 23 | """ 24 | 25 | train_micro_batch_size_per_gpu: int = None 26 | """ 27 | Batch size to be processed by one GPU in one step (without gradient accumulation). 
When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON. 28 | """ 29 | 30 | gradient_accumulation_steps: int = 1 31 | """ 32 | Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON. 33 | """ 34 | 35 | optimizer: dict = None 36 | """ 37 | dict containing the keys type and params 38 | 39 | type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. 40 | 41 | params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam). 42 | """ 43 | 44 | scheduler: dict = None 45 | """ 46 | dict containing the keys type and params 47 | 48 | type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. 49 | 50 | params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. 51 | """ 52 | 53 | fp32_allreduce: bool = False 54 | """ 55 | During gradient averaging perform allreduce with 32 bit values 56 | """ 57 | 58 | prescale_gradients: bool = False 59 | """ 60 | Scale gradients before doing allreduce 61 | """ 62 | 63 | gradient_predivide_factor: float = 1.0 64 | """ 65 | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs 66 | """ 67 | 68 | sparse_gradients: bool = False 69 | """ 70 | Enable sparse compression of torch.nn.Embedding gradients. 71 | """ 72 | 73 | fp16: dict = None 74 | """ 75 | Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package. 76 | """ 77 | 78 | amp: dict = None 79 | """ 80 | Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options 81 | """ 82 | 83 | gradient_clipping: float = 0.0 84 | """ 85 | Enable gradient clipping with provided value 86 | """ 87 | 88 | zero_optimization: dict = None 89 | """""" 90 | 91 | steps_per_print: int = 10 92 | """ 93 | Print train loss every N steps. 94 | """ 95 | 96 | wall_clock_breakdown: bool = False 97 | """ 98 | Enable timing of the latency of forward/backward/update training phases. 99 | """ 100 | 101 | dump_state: bool = False 102 | """ 103 | Print out state information of DeepSpeed object after initialization. 104 | """ 105 | 106 | flops_profiler: dict = None 107 | """ 108 | Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler 109 | """ 110 | 111 | zero_allow_untested_optimizer: bool = False 112 | """ 113 | Whether Deepspeed Zero Optimizer will allow an optimizer that hasn't been tested by the deepspeed team 114 | """ 115 | 116 | 117 | @dataclass 118 | class NeoXArgsDeepspeedRunner(NeoXArgsTemplate): 119 | """ 120 | Args for deepspeed runner (deepspeed.launcher.runner). 
121 | Every argument included here will be passed as command line argument to deepspeed.launcher.runner 122 | """ 123 | 124 | hostfile: str = None 125 | """ 126 | list of hostnames / ssh aliases and the number of GPUs per host 127 | 128 | example file contents: 129 | worker-1 slots=4 130 | worker-2 slots=4 131 | 127.0.0 slots=4 132 | 127.0.1 slots=4 133 | """ 134 | 135 | include: str = None 136 | """ 137 | Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`. 138 | """ 139 | 140 | exclude: str = None 141 | """ 142 | Specify hardware resources to NOT use during execution. Same format as include 143 | """ 144 | 145 | num_nodes: int = -1 146 | """ 147 | Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all. 148 | """ 149 | 150 | num_gpus: int = None 151 | """ 152 | Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all. 153 | """ 154 | 155 | master_port: int = 29500 156 | """ 157 | Port used by PyTorch distributed for communication during training. 158 | """ 159 | 160 | master_addr: str = None 161 | """ 162 | IP address of node 0, will be inferred via 'hostname -I' if not specified. 163 | """ 164 | 165 | launcher: str = "pdsh" 166 | """ 167 | Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. 168 | """ 169 | 170 | detect_nvlink_pairs: bool = False 171 | """ 172 | If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2. 173 | """ 174 | --------------------------------------------------------------------------------
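For reference, the three batch-size fields documented in NeoXArgsDeepspeedConfig above are tied together by the usual DeepSpeed relationship; a small arithmetic sketch with made-up values (editor's illustration):

train_micro_batch_size_per_gpu = 4
gradient_accumulation_steps = 8
data_parallel_world_size = 16    # GPUs divided by model- and pipe-parallel degrees

train_batch_size = (
    train_micro_batch_size_per_gpu
    * gradient_accumulation_steps
    * data_parallel_world_size
)
assert train_batch_size == 512   # samples consumed per optimizer step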