├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── test_dependencies.py │ ├── test_tokenizer.py │ ├── test_url_accessibility.py │ ├── test_format_conversion_scripts.py │ └── test_arguments.py ├── data │ └── sample_prompt.txt ├── neox_args │ ├── __init__.py │ ├── test_neoxargs_implementation.py │ └── test_neoxargs_usage.py ├── cpu_tests │ └── docker-compose.yml ├── model │ └── __init__.py ├── pytest.ini └── config │ └── test_setup.yml ├── tools ├── __init__.py ├── bash │ ├── kill.sh │ ├── killall.sh │ ├── README.md │ ├── sync_cmd.sh │ ├── sync.sh │ └── syncdir.sh ├── README.md ├── datasets │ ├── dataset_token_count.py │ ├── multinode_prepare_data.sh │ └── merge_datasets.py └── ckpts │ └── upload.py ├── megatron ├── model │ ├── rwkv │ │ ├── __init__.py │ │ └── v6 │ │ │ ├── __init__.py │ │ │ └── cuda │ │ │ └── wkv6_op.cpp │ ├── mamba │ │ └── __init__.py │ ├── megablocks_utils.py │ ├── __init__.py │ └── fused_bias_dropout.py ├── data │ ├── __init__.py │ ├── Makefile │ └── blendable_dataset.py ├── gradient_noise_scale │ └── __init__.py ├── tokenizer │ └── __init__.py ├── fused_kernels │ ├── compat.h │ └── scaled_upper_triang_masked_softmax.cpp ├── __init__.py ├── devutil.py ├── mpu │ ├── random.py │ └── __init__.py └── neox_arguments │ └── template.py ├── .dockerignore ├── requirements ├── requirements-apex-pip.txt ├── requirements-comet.txt ├── requirements-wandb.txt ├── requirements-sparseattention.txt ├── requirements-flashattention.txt ├── requirements-onebitadam.txt ├── requirements-s3.txt ├── requirements-tensorboard.txt ├── requirements-transformerengine.txt ├── requirements-mamba.txt ├── requirements-dev.txt └── requirements.txt ├── MANIFEST.in ├── images ├── memory_profiling.png ├── nsight_profiling.png └── pytorch_profiling.png ├── .idea ├── misc.xml ├── vcs.xml ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── DAPE.iml ├── configs ├── cpu_mock_config.yml ├── slurm_local.json ├── slurm_local.yml ├── prof.yml ├── docker │ └── pythia-paths.yml ├── text_generation.yml ├── sparse.yml ├── llama │ ├── 13B.yml │ ├── 30B.yml │ ├── 65B.yml │ ├── 7B.yml │ ├── README.md │ └── train_config.yml ├── llama2 │ ├── 13B.yml │ ├── 7B.yml │ ├── 70B.yml │ ├── codellama_7B.yml │ └── codellama_34B.yml ├── local_setup.yml ├── eleutherai_cluster.yml ├── local_setup_wandb.yml ├── mistral │ └── 7B.yml ├── local_setup_comet.yml ├── slurm_125M.yml ├── 125M-json.yml ├── autotuning_configs │ ├── tune_6-7B.json │ ├── tune.json │ ├── small_tune.json │ └── tune_1-3B.json ├── gmlp_small.yml ├── pythia │ ├── 70M.yml │ ├── 160M.yml │ ├── 1-4B.yml │ ├── 410M.yml │ ├── 6-9B.yml │ ├── 12B.yml │ ├── 1B.yml │ ├── 2-8B.yml │ ├── 31M.yml │ └── 14M.yml ├── 800M.yml ├── finetuning_configs │ └── 6-9B.yml ├── bf16_125M.yml ├── mamba │ ├── mamba-130M.yml │ ├── mamba-370M.yml │ ├── mamba-1.4B.yml │ ├── mamba-2.8B.yml │ └── mamba-790M.yml ├── 49M.yml ├── bnb_125M.yml ├── 19M.yml ├── 175B.yml ├── 350M.yml ├── 1-3B.yml ├── 2-7B.yml ├── 6-7B.yml ├── 13B.yml ├── 760M.yml ├── 125M.yml ├── 125M │ └── 512 │ │ ├── 125M_cope.yml │ │ ├── 125M_fire.yml │ │ ├── 125M_alibi.yml │ │ ├── 125M_alibi_c.yml │ │ ├── 125M_kerple.yml │ │ ├── 125M_fire_c.yml │ │ ├── 125M.yml │ │ └── 125M_fire_capev2.yml ├── rwkv │ └── 170M.yml ├── 125M-moe.yml └── 125M-dmoe.yml ├── docker-compose-dockerhub.yml ├── eval_tasks └── __init__.py ├── docker-compose.yml ├── deepy.py ├── train.py ├── .pre-commit-config.yaml ├── README-MUP.md ├── post-training ├── llama_data.py └── recreating_zephyr_dpo.md ├── prepare_data.py └── 
.gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /megatron/model/rwkv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 20B_checkpoints/ 2 | -------------------------------------------------------------------------------- /tools/bash/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /requirements/requirements-apex-pip.txt: -------------------------------------------------------------------------------- 1 | pip==23.3.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-comet.txt: -------------------------------------------------------------------------------- 1 | comet_ml>=3.45.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-wandb.txt: -------------------------------------------------------------------------------- 1 | wandb>=0.10.28 2 | -------------------------------------------------------------------------------- /requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==2.1.0 2 | -------------------------------------------------------------------------------- /tests/data/sample_prompt.txt: -------------------------------------------------------------------------------- 1 | Hello, I'm a language model 2 | -------------------------------------------------------------------------------- /requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==2.5.6 2 | -------------------------------------------------------------------------------- /requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111>=8.6.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-s3.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | hf-transfer>=0.1.3 3 | -------------------------------------------------------------------------------- /requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.13.0 2 | -------------------------------------------------------------------------------- /tools/bash/killall.sh: 
-------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /megatron/model/rwkv/v6/__init__.py: -------------------------------------------------------------------------------- 1 | from .rwkv import RWKVResidualLayerPipe, RWKVResidualLayer 2 | -------------------------------------------------------------------------------- /images/memory_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/memory_profiling.png -------------------------------------------------------------------------------- /images/nsight_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/nsight_profiling.png -------------------------------------------------------------------------------- /images/pytorch_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/pytorch_profiling.png -------------------------------------------------------------------------------- /requirements/requirements-transformerengine.txt: -------------------------------------------------------------------------------- 1 | pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable 2 | -------------------------------------------------------------------------------- /tests/neox_args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /megatron/model/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | from .mamba import ( 2 | ParallelMambaResidualLayer, 3 | ParallelMambaResidualLayerPipe, 4 | ) 5 | -------------------------------------------------------------------------------- /requirements/requirements-mamba.txt: -------------------------------------------------------------------------------- 1 | causal_conv1d>=1.1.0 2 | einops 3 | mamba_ssm>=1.2.0.post1 # required for untied embedding + unembedding layers 4 | -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8>=1.5.6 2 | clang-format>=13.0.1 3 | pre-commit>=2.17.0 4 | pytest>=6.2.3 5 | pytest-cov>=2.11.1 6 | pytest-forked>=1.3.0 7 | pytest-html==4.1.1 8 | pytest-xdist 9 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /configs/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /tests/unit/test_dependencies.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from megatron import fused_kernels 3 | 4 | 5 | def test_fused_kernels(): 6 | pytest.xfail(reason="Fused kernels require manual intervention to install") 7 | fused_kernels.load_fused_kernels() 8 | -------------------------------------------------------------------------------- /configs/slurm_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab-file": "data/gpt2-vocab.json", 3 | "merge-file": "data/gpt2-merges.txt", 4 | "save": "checkpoints", 5 | "checkpoint_validation_with_forward_pass": false, 6 | "tensorboard-dir": "tensorboard", 7 | "log-dir": "logs" 8 | } 9 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /configs/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data_path": "data/enwik8/enwik8_text_document", 3 | "vocab_file": "data/gpt2-vocab.json", 4 | "merge_file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard_dir": "tensorboard", 8 | "log_dir": "logs", 9 | } 10 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /tests/unit/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from megatron.tokenizer import train_tokenizer 3 | 4 | 5 | 
@pytest.mark.cpu 6 | def test_train_tokenizer(): 7 | input_args = [ 8 | "--json_input_dir", 9 | "./tests/data/enwik8_first100.txt", 10 | "--tokenizer_output_path", 11 | "", 12 | ] 13 | args = train_tokenizer.parse_args(input_args) 14 | train_tokenizer.main(args) 15 | -------------------------------------------------------------------------------- /configs/prof.yml: -------------------------------------------------------------------------------- 1 | # Sample profiling config 2 | { 3 | # Turns on nsys and pytorch profiling 4 | "profile": true, 5 | 6 | # pytorch profiler options 7 | "profile_step_start": 10, 8 | "profile_step_stop": 12, 9 | 10 | # pytorch memory profiler options 11 | "memory_profiling": true, 12 | "memory_profiling_path": tensorboard, 13 | 14 | 15 | # All trace files (pytorch, nsys, tensorboard, etc) will be written here 16 | "tensorboard_dir": "tensorboard", 17 | } 18 | -------------------------------------------------------------------------------- /.idea/DAPE.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed 2 | ftfy>=6.0.1 3 | huggingface_hub>=0.11.0 4 | jinja2==3.1.4 5 | lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 6 | lm_eval>=0.4.0,<=0.4.1 7 | mpi4py>=3.0.3 8 | numpy<2.0 9 | pybind11>=2.6.2 10 | regex 11 | sentencepiece 12 | six 13 | tiktoken>=0.1.2 14 | tokenizers>=0.12.1 15 | transformers==4.38.0 16 | -------------------------------------------------------------------------------- /tools/bash/README.md: -------------------------------------------------------------------------------- 1 | # Bash Scripts 2 | Useful for running distributed per-node scripts on e.g. 
Kubernetes 3 | 4 | * `kill.sh` kills all python processes 5 | * `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/` 6 | * `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/` 7 | * `sync.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` 8 | * `syncdir.sh` uses pdcp to recursively copy a provided directory to all of the nodes listed in `/job/hosts/` 9 | -------------------------------------------------------------------------------- /configs/docker/pythia-paths.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 3 | "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 4 | "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 5 | 6 | "tokenizer-type": "HFTokenizer", 7 | "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json", 8 | 9 | "save": "/home/mchorse/chk/", 10 | "load": "/home/mchorse/chk/", 11 | "checkpoint_validation_with_forward_pass": False 12 | } 13 | -------------------------------------------------------------------------------- /configs/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text_gen_type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num_samples": 10, 17 | 18 | # input/output file 19 | "sample_input_file": "sample_input.txt", 20 | "sample_output_file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /configs/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/cpu_tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' # slightly different to make sure CPU tests run without nvidia device 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: gpt-neox 6 | build: 7 | context: .
8 | dockerfile: Dockerfile 9 | shm_size: 1g 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | logging: 15 | options: 16 | max-size: "100m" 17 | max-file: "3" 18 | volumes: 19 | - ${NEOX_DATA_PATH}:/home/mchorse/data 20 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 21 | - .:/home/mchorse/gpt-neox 22 | -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /docker-compose-dockerhub.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: leogao2/gpt-neox:main 6 | shm_size: 1g 7 | ulimits: 8 | memlock: 9 | soft: -1 10 | hard: -1 11 | runtime: nvidia 12 | deploy: 13 | resources: 14 | reservations: 15 | devices: 16 | - driver: nvidia 17 | capabilities: [gpu] 18 | logging: 19 | options: 20 | max-size: "100m" 21 | max-file: "3" 22 | volumes: 23 | - ${NEOX_DATA_PATH}:/home/mchorse/data 24 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 25 | - .:/home/mchorse/gpt-neox 26 | -------------------------------------------------------------------------------- /eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: gpt-neox 6 | build: 7 | context: . 
8 | dockerfile: Dockerfile 9 | shm_size: 1g 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | runtime: nvidia 15 | deploy: 16 | resources: 17 | reservations: 18 | devices: 19 | - driver: nvidia 20 | capabilities: [gpu] 21 | logging: 22 | options: 23 | max-size: "100m" 24 | max-file: "3" 25 | volumes: 26 | - ${NEOX_DATA_PATH}:/home/mchorse/data 27 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 28 | - .:/home/mchorse/gpt-neox 29 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /configs/llama/13B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/30B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 60, 8 | "hidden_size": 6656, 9 | "num_attention_heads": 52, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/65B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | 
"seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama2/13B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 4096, 11 | "max_position_embeddings": 4096, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-5, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama2/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 4096, 11 | "max_position_embeddings": 4096, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-5, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/README.md: -------------------------------------------------------------------------------- 1 | # LLaMA 2 | 3 | ## Training and Finetuning 4 | 5 | These configs contain the architecture settings required to run inference/training/finetuning on the 
[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite. 6 | 7 | LLaMA finetuning can be launched with 8 | ```sh 9 | python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml 10 | ``` 11 | 12 | If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`. 13 | 14 | 15 | ## Inference 16 | 17 | 18 | LLaMA generation can be launched with 19 | ```sh 20 | python ./deepy.py ./generate.py -d configs \ 21 | llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \ 22 | -i input_prompt.txt -o prompt_out.txt 23 | ``` 24 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX Auxiliary Tools 2 | 3 | This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code. 4 | 5 | ## Bash 6 | 7 | This directory contains some simple, frequently used bash commands to make working on multiple machines easier. 8 | 9 | ## Checkpoints 10 | 11 | This directory contains tools for manipulating and converting checkpoints including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x. 12 | 13 | ## Datasets 14 | 15 | This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library. 16 | -------------------------------------------------------------------------------- /tests/unit/test_url_accessibility.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests 3 | 4 | from tools.datasets.corpora import DATA_DOWNLOADERS 5 | 6 | 7 | def check_url_accessible(url): 8 | try: 9 | response = requests.head(url, timeout=5) 10 | response.raise_for_status() 11 | return True 12 | except requests.exceptions.RequestException as e: 13 | print(f"Error: Unable to access URL - {e}") 14 | return False 15 | 16 | 17 | @pytest.mark.cpu 18 | @pytest.mark.parametrize("dataset_name", list(DATA_DOWNLOADERS.keys())) 19 | def test_url_accessibility(dataset_name): 20 | if dataset_name == "pass": 21 | return 22 | elif not dataset_name == "enwik8": 23 | pytest.xfail() 24 | for url in DATA_DOWNLOADERS[dataset_name].urls: 25 | assert check_url_accessible(url) 26 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | filterwarnings = 19 | ignore::DeprecationWarning:pkg_resources.* 20 | ignore::DeprecationWarning:torch.* 21 | -------------------------------------------------------------------------------- /tools/bash/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /configs/llama2/70B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "intermediate_size": 28672, 10 | "num_attention_heads": 64, 11 | "num_kv_heads": 8, 12 | "seq_length": 4096, 13 | "max_position_embeddings": 4096, 14 | "pos_emb": "rotary", 15 | "rotary_pct": 1, 16 | "rotary_emb_base": 1000000, 17 | "no_weight_tying": true, 18 | "gpt_j_residual": false, 19 | "output_layer_parallelism": "column", 20 | "norm": "rmsnorm", 21 | "rms_norm_epsilon": 1.0e-5, 22 | 23 | "attention_config": [[["flash"], 80]], 24 | 25 | "scaled_upper_triang_masked_softmax_fusion": true, 26 | "bias_gelu_fusion": false, 27 | "use_bias_in_norms": false, 28 | "use_bias_in_attn_linear": false, 29 | "activation": "swiglu", 30 | "mlp_multiple_of": 256, 31 | } 32 | -------------------------------------------------------------------------------- /configs/llama2/codellama_7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | # Codellama was uptrained on 16k token sequence lengths 11 | # with rotary_emb_base adjusted to 1_000_000. 
12 | "seq_length": 16384, 13 | "max_position_embeddings": 16384, 14 | "pos_emb": "rotary", 15 | "rotary_pct": 1, 16 | "rotary_emb_base": 1000000, 17 | "no_weight_tying": true, 18 | "gpt_j_residual": false, 19 | "output_layer_parallelism": "column", 20 | "norm": "rmsnorm", 21 | "rms_norm_epsilon": 1.0e-5, 22 | 23 | "attention_config": [[["flash"], 32]], 24 | 25 | "scaled_upper_triang_masked_softmax_fusion": true, 26 | "bias_gelu_fusion": false, 27 | "use_bias_in_norms": false, 28 | "use_bias_in_attn_linear": false, 29 | "activation": "swiglu", 30 | "mlp_multiple_of": 256, 31 | } 32 | -------------------------------------------------------------------------------- /tools/bash/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /configs/llama2/codellama_34B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 48, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | "num_kv_heads": 8, 11 | # Codellama was uptrained on 16k token sequence lengths 12 | # with rotary_emb_base adjusted to 1_000_000. 13 | "seq_length": 16384, 14 | "max_position_embeddings": 16384, 15 | "pos_emb": "rotary", 16 | "rotary_pct": 1, 17 | "rotary_emb_base": 1000000, 18 | "no_weight_tying": true, 19 | "gpt_j_residual": false, 20 | "output_layer_parallelism": "column", 21 | "norm": "rmsnorm", 22 | "rms_norm_epsilon": 1.0e-5, 23 | 24 | "attention_config": [[["flash"], 48]], 25 | 26 | "scaled_upper_triang_masked_softmax_fusion": true, 27 | "bias_gelu_fusion": false, 28 | "use_bias_in_norms": false, 29 | "use_bias_in_attn_linear": false, 30 | "activation": "swiglu", 31 | "mlp_multiple_of": 256, 32 | } 33 | -------------------------------------------------------------------------------- /megatron/model/megablocks_utils.py: -------------------------------------------------------------------------------- 1 | """Adapter to expose MegaBlocks package, if available.""" 2 | 3 | try: 4 | import megablocks 5 | except ImportError: 6 | megablocks = None 7 | 8 | 9 | def megablocks_is_available(): 10 | return megablocks is not None 11 | 12 | 13 | def assert_megablocks_is_available(): 14 | assert ( 15 | megablocks_is_available() 16 | ), "MegaBlocks not available. Please run `pip install megablocks`." 
17 | 18 | 19 | moe = megablocks.layers.moe if megablocks_is_available() else None 20 | dmoe = megablocks.layers.dmoe if megablocks_is_available() else None 21 | arguments = megablocks.layers.arguments if megablocks_is_available() else None 22 | 23 | 24 | def as_megablocks_args(neox_args): 25 | import copy 26 | 27 | tmp = copy.copy(neox_args) 28 | args = arguments.from_megatron(tmp) 29 | args.moe_lbl_in_fp32 = True 30 | args.fp16 = neox_args.precision == "fp16" 31 | args.moe_loss_weight = neox_args.moe_loss_coeff 32 | return args 33 | -------------------------------------------------------------------------------- /tools/datasets/dataset_token_count.py: -------------------------------------------------------------------------------- 1 | # Script counts tokens in a pretokenized dataset from preprocess_data.py 2 | # Necessary for setting batch size, train_iters, etc 3 | 4 | import sys 5 | import os 6 | 7 | ## Necessary for the import 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 9 | sys.path.insert(0, project_root) 10 | 11 | from megatron.data import indexed_dataset 12 | import numpy as np 13 | 14 | if len(sys.argv) < 2: 15 | print( 16 | "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..." 17 | ) 18 | sys.exit(1) 19 | 20 | # Access the command-line arguments 21 | arguments = sys.argv[1:] 22 | 23 | for arg in arguments: 24 | print("Checking file", arg) 25 | try: 26 | dataset = indexed_dataset.make_dataset(arg, "mmap") 27 | size = np.sum(dataset.sizes) 28 | print("Dataset size in tokens is", size) 29 | except AttributeError: 30 | print("Dataset could not be loaded", arg) 31 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /tools/bash/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .neox_arguments import NeoXArgs 27 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import ( 20 | get_params_for_weight_decay_optimization, 21 | mark_norms_for_sequence_parallel_grad_sync, 22 | ) 23 | from .word_embeddings import SoftEmbedding 24 | -------------------------------------------------------------------------------- /tests/unit/test_format_conversion_scripts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from tools.ckpts import convert_neox_to_hf 3 | from tests.common import simulate_deepy_env, save_random_model 4 | from megatron.neox_arguments.neox_args import NeoXArgsTokenizer 5 | 6 | 7 | def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): 8 | # Generate random GPT-NEOX model, check we can convert to hf format 9 | model_dir = str(tmpdir) 10 | input_args = ["train.py", "tests/config/test_setup.yml"] 11 | deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args) 12 | save_random_model(deepspeed_main_args, model_dir, train_iters=1) 13 | 14 | # Generate output 15 | script_args = [ 16 | "--config_file", 17 | "tests/config/test_setup.yml", 18 | "--input_dir", 19 | model_dir + "/global_step1", 20 | "--output_dir", 21 | model_dir, 22 | ] 23 | overwrite_values = {"tokenizer_type": NeoXArgsTokenizer.tokenizer_type} 24 | convert_neox_to_hf.main(input_args=script_args, overwrite_values=overwrite_values) 25 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /configs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "data/gpt2-vocab.json", 19 | "merge_file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | } 28 | -------------------------------------------------------------------------------- /configs/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], 5 | "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], 6 | "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split" 995,4,1 18 | # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", 19 | 20 | "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard_dir": "/mnt/ssd-1/tensorboard", 25 | "log_dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /configs/local_setup_wandb.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": 
["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "data/gpt2-vocab.json", 19 | "merge_file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /megatron/devutil.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | 3 | 4 | class Metric: 5 | """ 6 | Dumb utility to collect and report average wall-time metrics. 7 | """ 8 | 9 | def __init__(self, label): 10 | self.label = label 11 | self.measurements = [] 12 | 13 | def collect(self, measurement): 14 | self.measurements.append(measurement) 15 | 16 | def get_measurements(self): 17 | return self.measurements[:] 18 | 19 | def report(self): 20 | print( 21 | self.label, 22 | torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0), 23 | ) 24 | 25 | 26 | def monitor_method_cuda_wall_times(metric, obj, methodname): 27 | """ 28 | Measure timings for a method on an object or class. 
29 | 30 | For instance: 31 | 32 | >>> metric = Metric('!LNORM') 33 | >>> monitor_method_wall_times(metric, LayerNorm, 'forward') 34 | """ 35 | oldmeth = getattr(obj, methodname) 36 | 37 | start_event = torch.cuda.Event(enable_timing=True) 38 | end_event = torch.cuda.Event(enable_timing=True) 39 | 40 | def newmeth(*args, **kw): 41 | start_event.record() 42 | try: 43 | return oldmeth(*args, **kw) 44 | finally: 45 | end_event.record() 46 | torch.cuda.synchronize() 47 | elapsed = start_event.elapsed_time(end_event) 48 | metric.collect(elapsed) 49 | metric.report() 50 | 51 | setattr(obj, methodname, newmeth) 52 | -------------------------------------------------------------------------------- /configs/mistral/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "intermediate_size": 14336, 10 | "num_attention_heads": 32, 11 | "num_kv_heads": 8, 12 | # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen 13 | # and instruction tuned to 16384 seqlen, all with 4096 sliding window 14 | "seq_length": 8192, 15 | "sliding_window_width": 4096, 16 | "max_position_embeddings": 131072, 17 | "pos_emb": "rotary", 18 | "rotary_pct": 1, 19 | "rotary_emb_base": 10000, 20 | "no_weight_tying": true, 21 | "gpt_j_residual": false, 22 | "output_layer_parallelism": "column", 23 | "norm": "rmsnorm", 24 | "rms_norm_epsilon": 1.0e-5, 25 | 26 | # Grouped Query Attention is supported for both default ("global") 27 | # and Flash attention. However, we highly recommend the use of Flash attention 28 | # to get FLOP + runtime speedups when using GQA, 29 | # and sliding window attention is currently only supported by Flash attention. 30 | "attention_config": [[["flash"], 32]], 31 | 32 | "scaled_upper_triang_masked_softmax_fusion": true, 33 | "bias_gelu_fusion": false, 34 | "use_bias_in_norms": false, 35 | "use_bias_in_attn_linear": false, 36 | "activation": "swiglu", 37 | 38 | "tokenizer_type": "SPMTokenizer", 39 | #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download 40 | 41 | } 42 | -------------------------------------------------------------------------------- /deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2024, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed.launcher.runner 20 | 21 | 22 | def main(input_args=None): 23 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 24 | 25 | from megatron.neox_arguments import NeoXArgs 26 | from megatron.utils import get_wandb_api_key 27 | 28 | neox_args = NeoXArgs.consume_deepy_args(input_args) 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | deepspeed.launcher.runner.main(deepspeed_main_args) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /configs/local_setup_comet.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json", 19 | "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | "use_comet": True, 28 | # "comet_workspace": "test_workspace", # CHANGE ME 29 | "comet_project": "test_project", 30 | "comet_experiment_name": "test_experiment", 31 | "comet_tags": ["test_tag1", "test_tag2"], 32 | "comet_others": {"test_others"}, 33 | } 34 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | 23 | def main(input_args=None, overwrite_values=None): 24 | neox_args = NeoXArgs.consume_neox_args( 25 | input_args=input_args, overwrite_values=overwrite_values 26 | ) 27 | neox_args.configure_distributed_args() 28 | neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab 29 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 30 | neox_args.initialize_comet() # is initialized if comet directory is defined 31 | pretrain(neox_args=neox_args) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-json 7 | - id: check-symlinks 8 | - id: check-yaml 9 | - id: destroyed-symlinks 10 | - id: end-of-file-fixer 11 | exclude: ^(docs/CNAME/|configs/neox_arguments.md) 12 | - id: fix-byte-order-marker 13 | - id: fix-encoding-pragma 14 | args: [--remove] 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: requirements-txt-fixer 18 | - id: trailing-whitespace 19 | exclude: ^(docs/CNAME/|configs/neox_arguments.md) 20 | - repo: https://gitlab.com/daverona/pre-commit/cpp 21 | rev: 0.8.0 22 | hooks: 23 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 24 | args: [] 25 | 26 | - repo: https://github.com/psf/black 27 | rev: 22.3.0 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | - repo: https://github.com/codespell-project/codespell 32 | rev: v2.1.0 33 | hooks: 34 | - id: codespell 35 | args: [ 36 | '--ignore-words-list=reord,dout,te', # Word used in error messages that need rewording. te --> transformerengine 37 | --check-filenames, 38 | --check-hidden, 39 | ] 40 | exclude: tests/data/hf_cache/tokenizer/gpt2.json 41 | -------------------------------------------------------------------------------- /megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether apply model parallelsim to checkpointed hidden states. 
27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object. 30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /tools/ckpts/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except Exception: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except Exception: 40 | print(f"branch {branch_name} already exists, continuing...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file}!") 53 | -------------------------------------------------------------------------------- /README-MUP.md: -------------------------------------------------------------------------------- 1 | # How to use Mup (https://github.com/microsoft/mup) 2 | 3 | ## Add mup neox args to your config 4 | 5 | ``` 6 | # mup 7 | 8 | "use-mup": true, 9 | 10 | "save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank 11 | 12 | "base-shapes-file": "base-shapes", # load base shapes from this file 13 | 14 | "coord-check": false, # generate coord check plots to verify mup's implementation in neox 15 | 16 | # mup hp search 17 | 18 | "mup-init-scale": 1.0, 19 | 20 | "mup-attn-temp": 1.0, 21 | 22 | "mup-output-temp": 1.0, 23 | 24 | "mup-embedding-mult": 1.0, 25 | 26 | "mup-rp-embedding-mult": 1.0, 27 | ``` 28 | 29 | ## Generate base shapes 30 | 31 | 1. Set use-mup to true 32 | 2. Set save-base-shapes to true 33 | 3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank named .. gpt-neox will exit immediately. 34 | 4. 
Set save-base-shapes to false 35 | 36 | ## Generate coord check plots (optional) 37 | 38 | 1. Keep use-mup true 39 | 2. Set coord-check to true 40 | 3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately 41 | 4. Set coord-check to false 42 | 43 | ## Tune mup hyperparameters and LR 44 | 45 | The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These and LR are tuned with a random search using the scaled-up config (tested with 6-7B.yml) but with hidden-size set to the value from the scaled-down config (125M.yml). 46 | 47 | ## Transfer 48 | 49 | With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again. 50 | -------------------------------------------------------------------------------- /tests/unit/test_arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from megatron.neox_arguments import NeoXArgs 16 | from tests.common import BASE_CONFIG, DistributedTest 17 | 18 | 19 | def test_main_constructor(): 20 | input_args = ["train.py", "tests/config/test_setup.yml"] 21 | neox_args = NeoXArgs.consume_deepy_args(input_args) 22 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 23 | neox_args = NeoXArgs.consume_neox_args(input_args=deepspeed_main_args) 24 | neox_args.configure_distributed_args() 25 | 26 | 27 | class test_constructor_from_ymls_class(DistributedTest): 28 | world_size = 2 29 | 30 | def test(self): 31 | neox_args = NeoXArgs.from_ymls(["tests/config/test_setup.yml"]) 32 | neox_args.configure_distributed_args() 33 | 34 | 35 | def test_constructor_from_ymls(): 36 | t1 = test_constructor_from_ymls_class() 37 | t1.test() 38 | 39 | 40 | class test_constructor_from_dict_class(DistributedTest): 41 | world_size = 2 42 | 43 | def test(self): 44 | neox_args = NeoXArgs.from_dict(BASE_CONFIG) 45 | 46 | 47 | def test_constructor_from_dict(): 48 | t1 = test_constructor_from_dict_class() 49 | t1.test() 50 | -------------------------------------------------------------------------------- /post-training/llama_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, DatasetDict 4 | 5 | import jsonlines 6 | 7 | ############### 8 | # Load datasets 9 | ############### 10 | raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") 11 | # convert to just train and test, not necessary but it looks better 12 | raw_datasets = DatasetDict( 13 | { 14 | "train": raw_datasets["train_prefs"], 15 | "test": raw_datasets["test_prefs"], 16 | } 17 | ) 18 | os.makedirs(os.path.join("data", "pairwise"), exist_ok=True) 19 | for split in ["train", "test"]: 20 | with open( 21 | os.path.join("data", "pairwise", 
f"llama3_dpo_{split}_filtered.jsonl"), "w" 22 | ) as f: 23 | writer = jsonlines.Writer(f) 24 | for item in raw_datasets[split]: 25 | item["chosen"] = item["chosen"] 26 | item["rejected"] = item["rejected"] 27 | writer.write(item) 28 | os.makedirs(os.path.join("data", "sft"), exist_ok=True) 29 | for split in ["train", "test"]: 30 | with open( 31 | os.path.join("data", "sft", f"llama3_sft_{split}_filtered.jsonl"), "w" 32 | ) as f: 33 | writer = jsonlines.Writer(f) 34 | for item in raw_datasets[split]: 35 | item["messages"] = item["chosen"] 36 | writer.write(item) 37 | os.makedirs(os.path.join("data", "kto"), exist_ok=True) 38 | for split in ["train", "test"]: 39 | with open( 40 | os.path.join("data", "kto", f"llama3_kto_{split}_filtered.jsonl"), "w" 41 | ) as f: 42 | writer = jsonlines.Writer(f) 43 | for item in raw_datasets[split]: 44 | item["messages"] = item["chosen"] 45 | item["reward"] = 1 46 | writer.write(item) 47 | item["messages"] = item["rejected"] 48 | item["reward"] = -1 49 | writer.write(item) 50 | -------------------------------------------------------------------------------- /megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | updates a property value if the key already exists 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error. 
33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value() to be updated property " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exists 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /configs/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "num_layers": 12, 5 | "hidden_size": 768, 6 | "num_attention_heads": 12, 7 | "seq_length": 2048, 8 | "max_position_embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos_emb": "rotary", 11 | "no_weight_tying": true, 12 | "scaled_upper_triang_masked_softmax_fusion": true, 13 | "bias_gelu_fusion": true, 14 | "rope_fusion": false, 15 | "layernorm_fusion": false, 16 | "optimizer": { 17 | "type": "Adam", 18 | "params": { 19 | "lr": 0.0006, 20 | "betas": [0.9, 0.999], 21 | "eps": 1.0e-8 22 | } 23 | }, 24 | "zero_optimization": { 25 | "stage": 0, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 500000000, 28 | "overlap_comm": true, 29 | "reduce_scatter": true, 30 | "reduce_bucket_size": 500000000, 31 | "contiguous_gradients": true 32 | }, 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | "split": "949,50,1", 36 | "checkpoint_activations": true, 37 | "checkpoint_num_layers": 1, 38 | "partition_activations": true, 39 | "synchronize_each_layer": true, 40 | "gradient_clipping": 1.0, 41 | "weight_decay": 0.0, 42 | "hidden_dropout": 0.0, 43 | "attention_dropout": 0.0, 44 | "fp16": { 45 | "enabled": true, 46 | "loss_scale": 0, 47 | "loss_scale_window": 1000, 48 | "hysteresis": 2, 49 | "min_loss_scale": 1 50 | }, 51 | "train_iters": 320000, 52 | "lr_decay_iters": 320000, 53 | "distributed_backend": "nccl", 54 | "lr_decay_style": "cosine", 55 | "warmup": 0.01, 56 | "checkpoint_factor": 10000, 57 | "eval_interval": 1000, 58 | "eval_iters": 10, 59 | "log_interval": 100, 60 | "steps_per_print": 10, 61 | "keep_last_n_checkpoints": 4, 62 | "wall_clock_breakdown": true, 63 | "launcher": "slurm", 64 | "deepspeed_slurm": true, 65 | "comment": "neox" 66 | } 67 | -------------------------------------------------------------------------------- /configs/llama/train_config.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "finetune": true, 4 | 5 | # init methods 6 | "init_method": "small_init", 7 | "output_layer_init_method": "wang_init", 8 | 9 | # optimizer settings 10 | "optimizer": { 11 | "type": "Adam", 12 | "params": { 13 | "lr": 0.0002, 14 | "betas": [0.9, 0.95], 15 | "eps": 1.0e-8, 16 | } 17 | }, 18 | "min_lr": 0.00002, 19 | "override_lr_scheduler": true, 20 | 21 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 22 | "zero_optimization": { 23 | "stage": 1, 24 | "allgather_partitions": True, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": True, 27 | "reduce_scatter": True, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": True, 30 | }, 31 | 32 | # batch / data settings 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | 36 | # activation checkpointing 37 | 
"checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": true, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0, 46 | "attention_dropout": 0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "fp16": true, 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | 58 | # misc. training settings 59 | "train_iters": 320000, 60 | "lr_decay_iters": 320000, 61 | "distributed_backend": "nccl", 62 | "lr_decay_style": "cosine", 63 | "warmup": 0.01, 64 | "checkpoint_factor": 10000, 65 | "eval_interval": 1000, 66 | "eval_iters": 10, 67 | 68 | # logging 69 | "log_interval": 100, 70 | "steps_per_print": 10, 71 | "keep_last_n_checkpoints": 4, 72 | "wall_clock_breakdown": true, 73 | "mlp_multiple_of": 256, 74 | } 75 | -------------------------------------------------------------------------------- /configs/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0.0, 55 | "attention_dropout": 0.0, 56 | 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 1000, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1 63 | }, 64 | 65 | "train_iters": 320000, 66 | "lr_decay_iters": 320000, 67 | "distributed_backend": "nccl", 68 | "lr_decay_style": "cosine", 69 | "warmup": 0.01, 70 | "checkpoint_factor": 10000, 71 | "eval_interval": 1000, 72 | "eval_iters": 10, 73 | 74 | "log_interval": 100, 75 | "steps_per_print": 10, 76 | "keep_last_n_checkpoints": 4, 77 | "wall_clock_breakdown": true, 78 | 79 | "hostfile": "/mock_path" 80 | } 81 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune_6-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 8, 4 | 5 | "num-layers": 32, 6 | "hidden-size": 4096, 7 | "num-attention-heads": 32, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | 
"scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.00012, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "zero_optimization": { 29 | "stage": [0, 1, 2, 3] 30 | }, 31 | "data-impl": "mmap", 32 | "split": "949,50,1", 33 | 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | 39 | "gradient_clipping": 1.0, 40 | "weight-decay": 0, 41 | "hidden-dropout": 0, 42 | "attention-dropout": 0, 43 | 44 | "fp16": { 45 | "fp16": true, 46 | "enabled": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "hysteresis": 2, 50 | "min_loss_scale": 1 51 | }, 52 | 53 | "train-iters": 100, 54 | "lr-decay-iters": 320000, 55 | "distributed-backend": "nccl", 56 | "lr-decay-style": "cosine", 57 | "warmup": 0.01, 58 | "checkpoint-factor": 10000, 59 | "eval-interval": 1000, 60 | "eval-iters": 10, 61 | "log-interval": 100, 62 | "steps_per_print": 10, 63 | "keep-last-n-checkpoints": 4, 64 | "wall_clock_breakdown": true, 65 | "launcher": "slurm", 66 | "deepspeed_slurm": true, 67 | "no_ssh_check": true, 68 | "comment": "neox", 69 | "autotuning": { 70 | "enabled": true, 71 | "mp_size": 8, 72 | "arg_mappings": { 73 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 74 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /configs/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num_layers": 12, 12 | "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 15 | "seq_length": 2048, 16 | "max_position_embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos_emb": "none", 19 | "no_weight_tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e_8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data_impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": false, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0.0, 46 | "attention_dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. 
training settings 58 | "train_iters": 320000, 59 | "lr_decay_iters": 320000, 60 | "distributed_backend": "nccl", 61 | "lr_decay_style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint_factor": 10000, 64 | "eval_interval": 1000, 65 | "eval_iters": 10, 66 | 67 | # logging 68 | "log_interval": 100, 69 | "steps_per_print": 10, 70 | "keep_last_n_checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | 
"reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true, 30 | "cpu_offload": false 31 | }, 32 | "train_micro_batch_size_per_gpu": 1, 33 | "autotuning_config": { 34 | "enabled": true, 35 | "arg_mappings": { 36 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 37 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 38 | } 39 | }, 40 | "data-impl": "mmap", 41 | "split": "949,50,1", 42 | "checkpoint-activations": true, 43 | "checkpoint-num-layers": 1, 44 | "partition-activations": true, 45 | "synchronize-each-layer": true, 46 | "gradient_clipping": 1.0, 47 | "weight-decay": 0.0, 48 | "hidden-dropout": 0.0, 49 | "attention-dropout": 0.0, 50 | "fp16": { 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | "train-iters": 200, 58 | "lr-decay-iters": 320000, 59 | "distributed-backend": "nccl", 60 | "lr-decay-style": "cosine", 61 | "warmup": 0.01, 62 | "save-interval": 10000, 63 | "eval-interval": 1000, 64 | "eval-iters": 10, 65 | "log-interval": 100, 66 | "steps_per_print": 10, 67 | "keep-last-n-checkpoints": 4, 68 | "wall_clock_breakdown": true, 69 | "launcher": "slurm", 70 | "deepspeed_slurm": true, 71 | "comment": "neox" 72 | } 73 | -------------------------------------------------------------------------------- /configs/pythia/70M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 6, 6 | "hidden_size": 512, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 6]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.001, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.0001, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 100000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | 
-------------------------------------------------------------------------------- /configs/pythia/160M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 12]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/1-4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0002, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00002, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 16, 46 | "data_impl": "mmap", 47 | 
"num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/410M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0003, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00003, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 4096, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | 
"max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 32]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00012, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8 29 | } 30 | }, 31 | 32 | "min_lr": 0.000012, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 1260000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 1260000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/autotuning_configs/small_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.0006, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "data-impl": "mmap", 29 | "split": "949,50,1", 30 | 31 | "checkpoint-activations": true, 32 | "checkpoint-num-layers": 1, 33 | "partition-activations": true, 34 | "synchronize-each-layer": true, 35 | 36 | "gradient_clipping": 1.0, 37 | "weight-decay": 0.0, 38 | "hidden-dropout": 0.0, 39 | "attention-dropout": 0.0, 40 | 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "loss_scale_window": 1000, 45 | "hysteresis": 2, 46 | "min_loss_scale": 1 47 | }, 48 | 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "save-interval": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | 58 | "log-interval": 100, 59 | "steps_per_print": 10, 60 | "keep-last-n-checkpoints": 4, 61 | "wall_clock_breakdown": true, 62 | "launcher": "slurm", 63 | "deepspeed_slurm": true, 64 | "comment": "neox", 65 | 
"autotuning": { 66 | "enabled": true, 67 | "arg_mappings": { 68 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 69 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 70 | } 71 | }, 72 | "zero_optimization": { 73 | "stage": [0, 1, 2, 3] 74 | }, 75 | "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 76 | "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 77 | "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] 78 | } 79 | -------------------------------------------------------------------------------- /configs/pythia/12B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | 5 | "num_layers": 36, 6 | "hidden_size": 5120, 7 | "num_attention_heads": 40, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 36]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00012, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000012, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 1260000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 1260000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "train_micro_batch_size_per_gpu": 8, 44 | "gradient_accumulation_steps": 2, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0, 55 | "attention_dropout": 0, 56 | 57 | "fp16": { 58 | "fp16": true, 59 | "enabled": true, 60 | "loss_scale": 0, 61 | "loss_scale_window": 1000, 62 | "initial_scale_power": 12, 63 | "hysteresis": 2, 64 | "min_loss_scale": 1 65 | }, 66 | 67 | "train_iters": 143000, 68 | "lr_decay_iters": 143000, 69 | "distributed_backend": "nccl", 70 | "lr_decay_style": "cosine", 71 | "warmup": 0.01, 72 | "checkpoint_factor": 1000, 73 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 74 | "eval_interval": 143000, 75 | "eval_iters": 10, 76 | 77 | "log_interval": 10, 78 | "steps_per_print": 10, 79 | "wall_clock_breakdown": true, 80 | 81 | "log_grad_norm": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 16, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": true, 17 | "bias_gelu_fusion": true, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 
| "lr": 0.00025, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000025, 31 | 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "fp16": { 44 | "enabled": true, 45 | "type": "bfloat16", 46 | "auto_cast": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "initial_scale_power": 12, 50 | "hysteresis": 2, 51 | "min_loss_scale": 1 52 | }, 53 | 54 | "fp32_allreduce": true, 55 | 56 | "train_micro_batch_size_per_gpu": 4, 57 | "gradient_accumulation_steps": 4, 58 | "data_impl": "mmap", 59 | "num_workers": 1, 60 | 61 | "checkpoint_activations": true, 62 | "checkpoint_num_layers": 1, 63 | "partition_activations": true, 64 | "synchronize_each_layer": true, 65 | 66 | "gradient_clipping": 1.0, 67 | "weight_decay": 0.1, 68 | "hidden_dropout": 0, 69 | "attention_dropout": 0, 70 | 71 | "train_iters": 143000, 72 | "lr_decay_iters": 143000, 73 | "distributed_backend": "nccl", 74 | "lr_decay_style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint_factor": 1000, 77 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 78 | "eval_interval": 143000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | "tokenizer_type": "HFTokenizer" 86 | } 87 | -------------------------------------------------------------------------------- /configs/pythia/2-8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 32]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00016, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.000016, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": 
[0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 40000, 78 | "eval_iters": 10, 79 | 80 | "log_grad_norm": true, 81 | 82 | "log_interval": 10, 83 | "steps_per_print": 10, 84 | "wall_clock_breakdown": true, 85 | 86 | "tokenizer_type": "HFTokenizer" 87 | } 88 | -------------------------------------------------------------------------------- /configs/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 16, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | # init methods 22 | "init_method": "small_init", 23 | "output_layer_init_method": "wang_init", 24 | 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.00025, 29 | "betas": [0.9, 0.95], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "min_lr": 0.000025, 34 | 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | "train_micro_batch_size_per_gpu": 16, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1, 72 | }, 73 | 74 | "train_iters": 143000, 75 | "lr_decay_iters": 143000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 1000, 80 | "eval_interval": 40000, 81 | "eval_iters": 10, 82 | 83 | "log_interval": 10, 84 | "steps_per_print": 10, 85 | "wall_clock_breakdown": true, 86 | } 87 | -------------------------------------------------------------------------------- /tests/config/test_setup.yml: -------------------------------------------------------------------------------- 1 | # 19M parameter model, & local setup with some additional simplifications 2 | { 3 | # Settings to make the test setup as lightweight as possible 4 | "data_path": "data/enwik8/enwik8_text_document", 5 | "vocab_file": "data/gpt2-vocab.json", 6 | "merge_file": "data/gpt2-merges.txt", 7 | "lr_decay_iters": 20, 8 | "train_iters": 20, 9 | "hostfile": "None", 10 | "include": "localhost:1", 11 | "use_wandb": False, 12 | 13 | # Settings copied from 19M parameter config (some modifications above, meaning we can't use configs/19M.yml directly) 14 | "pipe_parallel_size": 1, 15 | "model_parallel_size": 1, 16 | 17 | # model settings 18 | "num_layers": 2, 19 | "hidden_size": 8, 20 | "num_attention_heads": 4, 21 | 
"seq_length": 1024, 22 | "max_position_embeddings": 1024, 23 | "pos_emb": "rotary", 24 | "no_weight_tying": true, 25 | "gpt_j_residual": false, 26 | "output_layer_parallelism": "column", 27 | 28 | "scaled_upper_triang_masked_softmax_fusion": false, 29 | "bias_gelu_fusion": false, 30 | "rope_fusion": false, 31 | "layernorm_fusion": false, 32 | 33 | # Optimizer 34 | "optimizer": { 35 | "type": "sm3", 36 | "params": {}, 37 | }, 38 | 39 | # precision 40 | "precision": "fp16", 41 | 42 | # init methods 43 | "init_method": "small_init", 44 | "output_layer_init_method": "wang_init", 45 | 46 | "train_micro_batch_size_per_gpu": 4, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | "distributed_backend": "nccl", 64 | "lr_decay_style": "cosine", 65 | "warmup": 0.01, 66 | "checkpoint_factor": 1000, 67 | "eval_interval": 100000, 68 | "eval_iters": 10, 69 | 70 | "log_interval": 10, 71 | "steps_per_print": 10, 72 | "wall_clock_breakdown": true, 73 | 74 | # additional deepspeed args not specified above 75 | "deepspeed_extra_args": { 76 | "comms_logger": { 77 | "enabled": true, 78 | "verbose": true, 79 | "prof_all": true, 80 | "debug": false 81 | }, 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /configs/finetuning_configs/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "load": "/path/to/checkpoint", 4 | "finetune": true, 5 | 6 | "pipe-parallel-size": 1, 7 | "model-parallel-size": 2, 8 | 9 | "num-layers": 32, 10 | "hidden-size": 4096, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "rotary_pct": 0.25, 17 | "no-weight-tying": true, 18 | "gpt_j_residual": true, 19 | "output_layer_parallelism": "column", 20 | 21 | "attention-config": [[["flash"], 32]], 22 | 23 | "scaled-upper-triang-masked-softmax-fusion": true, 24 | "bias-gelu-fusion": true, 25 | 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00012, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | 36 | "min_lr": 0.000012, 37 | 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": true, 41 | "allgather_bucket_size": 1260000000, 42 | "overlap_comm": true, 43 | "reduce_scatter": true, 44 | "reduce_bucket_size": 1260000000, 45 | "contiguous_gradients": true, 46 | "cpu_offload": false, 47 | "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params 48 | }, 49 | 50 | "train_micro_batch_size_per_gpu": 8, 51 | "gradient_accumulation_steps": 2, 52 | "data-impl": "mmap", 53 | 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | "gradient_clipping": 1.0, 60 | "weight-decay": 0.1, 61 | "hidden-dropout": 0, 62 | "attention-dropout": 0, 63 | 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | "train-iters": 143000, 75 | "lr-decay-iters": 143000, 76 | 
"distributed-backend": "nccl", 77 | "lr-decay-style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint-factor": 1000, 80 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 81 | "eval-interval": 143000, 82 | "eval-iters": 10, 83 | 84 | "log-interval": 10, 85 | "steps_per_print": 10, 86 | "wall_clock_breakdown": true, 87 | 88 | "tokenizer_type": "HFTokenizer" 89 | } 90 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune_1-3B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 24, 6 | "hidden-size": 2048, 7 | "num-attention-heads": 16, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | "attention_config": [[["flash"], 24]], 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0002, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00002, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | "train_micro_batch_size_per_gpu": 1, 42 | "autotuning": { 43 | "enabled": true, 44 | "arg_mappings": { 45 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 46 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 47 | } 48 | }, 49 | "data-impl": "mmap", 50 | 51 | "checkpoint-activations": false, 52 | "checkpoint-num-layers": 1, 53 | "partition-activations": true, 54 | "synchronize-each-layer": true, 55 | 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint-factor": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | "launcher": "slurm", 79 | "deepspeed_slurm": true, 80 | "no_ssh_check": true, 81 | 82 | "log-interval": 10, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 1, 85 | "wall_clock_breakdown": true 86 | } 87 | -------------------------------------------------------------------------------- /configs/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true 
if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | "rope_fusion": false, 22 | "layernorm_fusion": false, 23 | 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0006, 30 | "betas": [0.9, 0.999], 31 | "eps": 1.0e-8, 32 | } 33 | }, 34 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 35 | "zero_optimization": { 36 | "stage": 0, 37 | "allgather_partitions": True, 38 | "allgather_bucket_size": 500000000, 39 | "overlap_comm": True, 40 | "reduce_scatter": True, 41 | "reduce_bucket_size": 500000000, 42 | "contiguous_gradients": True, 43 | }, 44 | 45 | # batch / data settings 46 | "train_micro_batch_size_per_gpu": 4, 47 | "data_impl": "mmap", 48 | "split": "949,50,1", 49 | 50 | # activation checkpointing 51 | "checkpoint_activations": true, 52 | "checkpoint_num_layers": 1, 53 | "partition_activations": true, 54 | "synchronize_each_layer": true, 55 | 56 | # regularization 57 | "gradient_clipping": 1.0, 58 | "weight_decay": 0.0, 59 | "hidden_dropout": 0.0, 60 | "attention_dropout": 0.0, 61 | 62 | "precision": "bfloat16", 63 | 64 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 65 | # misc. training settings 66 | "train_iters": 320000, 67 | "lr_decay_iters": 320000, 68 | "distributed_backend": "nccl", 69 | "lr_decay_style": "cosine", 70 | "warmup": 0.01, 71 | "checkpoint_factor": 10000, 72 | "eval_interval": 1000, 73 | "eval_iters": 10, 74 | 75 | # logging 76 | "log_interval": 100, 77 | "steps_per_print": 10, 78 | "keep_last_n_checkpoints": 4, 79 | "wall_clock_breakdown": true, 80 | } 81 | -------------------------------------------------------------------------------- /post-training/recreating_zephyr_dpo.md: -------------------------------------------------------------------------------- 1 | # Initial setup 2 | 3 | ```bash 4 | python tools/ckpts/convert_hf_llama_to_neox.py --tp 2 --model HuggingFaceH4/mistral-7b-sft-beta --model_path checkpoints/neox_converted/zephyr-sft_tp2 5 | ``` 6 | 7 | 8 | # To generate data 9 | First make a new environment... We want to keep the same data between runs so the easiest way is to create a new conda 10 | environment and follow the steps below. 11 | ``` 12 | conda create -n handbook python=3.10 && conda activate handbook 13 | git clone https://github.com/huggingface/alignment-handbook.git 14 | cd ./alignment-handbook/ 15 | python -m pip install . 
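# note: jsonlines is installed on top of the handbook package; presumably it is what the data-generation script used below (post-training/dpo_data.py) needs to write the .jsonl files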
16 | python -m pip install jsonlines 17 | ``` 18 | 19 | ## DPO data 20 | ```bash 21 | # from the gpt-neox repo 22 | conda activate handbook 23 | python post-training/dpo_data.py 24 | conda deactivate 25 | # activate your neox conda environment, or whatever you need to switch to the neox environment 26 | mkdir data 27 | mkdir data/pairwise 28 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 29 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 30 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 31 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 32 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 33 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 34 | ``` 35 | 36 | ## Running 37 | ```bash 38 | python deepy.py train.py post-training/configs/benchmarking/mistral-dpo.yml 39 | ``` 40 | -------------------------------------------------------------------------------- /configs/mamba/mamba-130M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 24, 7 | "hidden_size": 768, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 24]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | 27 | # optimizer settings 28 | "optimizer": { 29 | "type": "Adam", 30 | "params": { 31 | "lr": 0.0006, 32 | "betas": [0.9, 0.95], 33 | "eps": 1.0e-8, 34 | } 35 | }, 36 | "min_lr": 0.00006, 37 | 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | 49 | # batch / data settings 50 | 
"train_micro_batch_size_per_gpu": 4, 51 | "data_impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint_activations": true, 55 | "checkpoint_num_layers": 1, 56 | "partition_activations": true, 57 | "synchronize_each_layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight_decay": 0.1, 62 | "hidden_dropout": 0.0, 63 | "attention_dropout": 0.0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-370M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 1024, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0003, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00003, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | # batch / data settings 48 | "train_micro_batch_size_per_gpu": 4, 49 | "data_impl": "mmap", 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "hysteresis": 2, 70 | "min_loss_scale": 1 71 | }, 72 | 73 | # misc. 
training settings 74 | "train_iters": 320000, 75 | "lr_decay_iters": 320000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 10000, 80 | "eval_interval": 1000, 81 | "eval_iters": 10, 82 | 83 | # logging 84 | "log_interval": 100, 85 | "steps_per_print": 10, 86 | "keep_last_n_checkpoints": 4, 87 | "wall_clock_breakdown": true, 88 | } 89 | -------------------------------------------------------------------------------- /tools/datasets/multinode_prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # USAGE: 4 | # This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributing the chunks 5 | # over the processes. 6 | # This bash script takes a single text file as its input argument. 7 | # The text file contains a valid filepath on each line, each pointing to a jsonl file. 8 | # Furthermore, environment variables for the rank and the world size need to be set. 9 | # These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well 10 | # using the variables $RANK and $WORLD_SIZE, which will override the cluster-specific variables. 11 | # You can also add all arguments of the preprocess_data.py script to this script and it will simply pass them through. 12 | 13 | # Parse command-line arguments 14 | text_file="$1" 15 | rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}" 16 | world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}" 17 | num_lines=$(wc -l < "$text_file") 18 | chunk_size=$((num_lines / world_size)) 19 | start_line=$((rank * chunk_size + 1)) 20 | end_line=$((start_line + chunk_size - 1)) 21 | 22 | # Make sure the last chunk includes all remaining lines 23 | if [[ $rank == $((world_size - 1)) ]]; then 24 | end_line=$num_lines 25 | fi 26 | 27 | # Select the chunk of the text file that corresponds to the rank 28 | chunk_file="chunk_${rank}.txt" 29 | sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file" 30 | 31 | # Parse additional flags to be passed to the Python script 32 | shift 1 # Shift past the first argument (the input text file) 33 | py_args="" 34 | prefix_arg="" 35 | while [[ $# -gt 0 ]]; do 36 | case "$1" in 37 | --output-prefix=*) prefix_arg="$1"; shift;; 38 | --output-prefix) prefix_arg="$1 $2"; shift 2;; 39 | --*) py_args="$py_args $1 $2"; shift 2;; 40 | *) echo "Unknown argument: $1"; exit 1;; 41 | esac 42 | done 43 | 44 | # Add the rank to the --output-prefix argument if it is set 45 | if [[ -n "$prefix_arg" ]]; then 46 | py_args="$py_args $prefix_arg$rank" 47 | else 48 | # Inject a default --output-prefix argument containing the rank 49 | py_args="$py_args --output-prefix rank${rank}" 50 | fi 51 | 52 | 53 | echo "processing $chunk_file with rank $rank at world size $world_size" 54 | echo "using the following args: $py_args" 55 | # Call the Python script with the list of file paths in the chunk 56 | python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args 57 | 58 | # Clean up 59 | rm "$chunk_file" 60 | -------------------------------------------------------------------------------- /configs/mamba/mamba-1.4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 12, # ignored when
using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0002, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00002, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 1, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-2.8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 64, 7 | "hidden_size": 2560, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 64]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | 
"overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-790M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 1536, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00025, 31 | "betas": [0.9, 0.999], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.000025, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. 
training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/49M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe_parallel_size": 1, 4 | "model_parallel_size": 1, 5 | 6 | # model settings 7 | "num_layers": 10, 8 | "hidden_size": 640, 9 | "num_attention_heads": 10, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 0.25, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": true, 16 | "output_layer_parallelism": "column", 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | "rope_fusion": false, 22 | "layernorm_fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0008, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00008, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 32, 52 | "gradient_accumulation_steps": 1, 53 | "data_impl": "mmap", 54 | "num_workers": 1, 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "initial_scale_power": 12, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1, 77 | }, 78 | 79 | # misc. 
training settings 80 | "train_iters": 143000, 81 | "lr_decay_iters": 143000, 82 | "distributed_backend": "nccl", 83 | "lr_decay_style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint_factor": 1000, 86 | "eval_interval": 100000, 87 | "eval_iters": 10, 88 | 89 | # logging 90 | "log_interval": 10, 91 | "steps_per_print": 10, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/bnb_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "use_bnb_optimizer": true, 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled_upper_triang_masked_softmax_fusion": false, 21 | "bias_gelu_fusion": false, 22 | "rope_fusion": false, 23 | "layernorm_fusion": false, 24 | 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.999], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 0, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | # batch / data settings 47 | "train_micro_batch_size_per_gpu": 4, 48 | "data_impl": "mmap", 49 | "split": "949,50,1", 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.0, 60 | "hidden_dropout": 0.0, 61 | "attention_dropout": 0.0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "enabled": true, 66 | "loss_scale": 0, 67 | "loss_scale_window": 1000, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1 70 | }, 71 | 72 | # misc. 
training settings 73 | "train_iters": 320000, 74 | "lr_decay_iters": 320000, 75 | "distributed_backend": "nccl", 76 | "lr_decay_style": "cosine", 77 | "warmup": 0.01, 78 | "checkpoint_factor": 10000, 79 | "eval_interval": 1000, 80 | "eval_iters": 10, 81 | 82 | # logging 83 | "log_interval": 100, 84 | "steps_per_print": 10, 85 | "keep_last_n_checkpoints": 4, 86 | "wall_clock_breakdown": true, 87 | } 88 | -------------------------------------------------------------------------------- /configs/19M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 6, 7 | "hidden_size": 512, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | # init methods 22 | "init_method": "small_init", 23 | "output_layer_init_method": "wang_init", 24 | 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.001, 29 | "betas": [0.9, 0.95], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "min_lr": 0.0001, 34 | 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | "train_micro_batch_size_per_gpu": 4, #32, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1, 72 | }, 73 | 74 | "train_iters": 143000, 75 | "lr_decay_iters": 143000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 1000, 80 | "eval_interval": 100000, 81 | "eval_iters": 10, 82 | 83 | "log_interval": 10, 84 | "steps_per_print": 10, 85 | "wall_clock_breakdown": true, 86 | 87 | # additional deepspeed args not specified above 88 | "deepspeed_extra_args": { 89 | "comms_logger": { 90 | "enabled": true, 91 | "verbose": true, 92 | "prof_all": true, 93 | "debug": false 94 | }, 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from tools.datasets.corpora import prepare_dataset, DATA_DOWNLOADERS 16 | import argparse 17 | 18 | TOKENIZER_CHOICES = [ 19 | "HFGPT2Tokenizer", 20 | "HFTokenizer", 21 | "GPT2BPETokenizer", 22 | "CharLevelTokenizer", 23 | "TiktokenTokenizer", 24 | "SPMTokenizer", 25 | ] 26 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"] 27 | 28 | 29 | def get_args(): 30 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets") 31 | parser.add_argument( 32 | "dataset", 33 | nargs="?", 34 | default="enwik8", 35 | help="name of dataset to download.", 36 | choices=DATASET_CHOICES, 37 | ) 38 | parser.add_argument( 39 | "-t", 40 | "--tokenizer", 41 | default="GPT2BPETokenizer", 42 | choices=TOKENIZER_CHOICES, 43 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}', 44 | ) 45 | parser.add_argument( 46 | "-d", 47 | "--data-dir", 48 | default=None, 49 | help=f"Directory to which to download datasets / tokenizer " 50 | f"files - defaults to ./data", 51 | ) 52 | parser.add_argument( 53 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)" 54 | ) 55 | parser.add_argument( 56 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)" 57 | ) 58 | parser.add_argument( 59 | "-f", 60 | "--force-redownload", 61 | dest="force_redownload", 62 | default=False, 63 | action="store_true", 64 | ) 65 | return parser.parse_args() 66 | 67 | 68 | if __name__ == "__main__": 69 | args = get_args() 70 | prepare_dataset( 71 | dataset_name=args.dataset, 72 | tokenizer_type=args.tokenizer, 73 | data_dir=args.data_dir, 74 | vocab_file=args.vocab_file, 75 | merge_file=args.merge_file, 76 | force_redownload=args.force_redownload, 77 | ) 78 | -------------------------------------------------------------------------------- /tools/datasets/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | 6 | sys.path.append( 7 | os.path.abspath( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) 9 | ) 10 | ) 11 | 12 | from megatron.data import indexed_dataset 13 | 14 | 15 | def main(args): 16 | 17 | prefixes = set() 18 | for basename in os.listdir(args.input): 19 | prefix, ext = os.path.splitext(basename) 20 | 21 | if prefix in prefixes: 22 | continue 23 | 24 | if not os.path.isfile(os.path.join(args.input, basename)): 25 | continue 26 | 27 | ext_pair = ".bin" if ext == ".idx" else ".idx" 28 | assert os.path.isfile( 29 | os.path.join(args.input, prefix) + ext_pair 30 | ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" 31 | 32 | prefixes.add(prefix) 33 | 34 | builder = None 35 | for prefix in sorted(prefixes): 36 | if builder is None: 37 | dataset = indexed_dataset.make_dataset( 38 | os.path.join(args.input, prefix), "infer" 39 | ) 40 | 41 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset): 42 | builder = indexed_dataset.MMapIndexedDatasetBuilder( 43 | args.output_prefix + ".bin", dtype=dataset._index.dtype 
44 | ) 45 | else: 46 | builder = indexed_dataset.IndexedDatasetBuilder( 47 | args.output_prefix + ".bin" 48 | ) 49 | 50 | del dataset 51 | 52 | builder.merge_file_(os.path.join(args.input, prefix)) 53 | 54 | builder.finalize(args.output_prefix + ".idx") 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | 60 | group = parser.add_argument_group(title="input data") 61 | group.add_argument( 62 | "--input", 63 | type=str, 64 | required=True, 65 | help="Path to directory containing all document files to merge", 66 | ) 67 | 68 | group = parser.add_argument_group(title="output data") 69 | group.add_argument( 70 | "--output-prefix", 71 | type=str, 72 | required=True, 73 | help="Path to binary output file without suffix", 74 | ) 75 | 76 | args = parser.parse_args() 77 | 78 | assert os.path.isdir( 79 | args.input 80 | ), f"ERROR: {args.input} is not a directory or does not exist" 81 | 82 | assert os.path.isdir( 83 | os.path.dirname(args.output_prefix) 84 | ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" 85 | 86 | main(args) 87 | -------------------------------------------------------------------------------- /configs/pythia/31M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 0, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 6, 8 | "hidden-size": 256, 9 | "num-attention-heads": 8, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 6]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # init methods 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.0001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # batch size (trained on 32 gpus) 49 | "train_micro_batch_size_per_gpu": 32, 50 | "data-impl": "mmap", 51 | "num_workers": 2, 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": false, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": false, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "initial_scale_power": 12, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train-iters": 143000, 77 | "lr-decay-iters": 143000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 1000, 82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 83 | "eval-interval": 100000, 84 | "eval-iters": 10, 85 | "log-interval": 10, 86 | "steps_per_print": 10, 87 | "wall_clock_breakdown": true, 88 | 89 | "train-data-paths": 
["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 90 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 91 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 92 | 93 | "tokenizer-type": "HFTokenizer", 94 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" 95 | 96 | } 97 | -------------------------------------------------------------------------------- /configs/pythia/14M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 0, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 6, 8 | "hidden-size": 128, 9 | "num-attention-heads": 4, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 6]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # init methods 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.0001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 50000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 50000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # batch size (trained on 32 gpus) 49 | "train_micro_batch_size_per_gpu": 32, 50 | "data-impl": "mmap", 51 | "num_workers": 4, 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": false, #true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": false, #true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "initial_scale_power": 12, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train-iters": 143000, 77 | "lr-decay-iters": 143000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 1000, 82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 83 | "eval-interval": 100000, 84 | "eval-iters": 10, 85 | 86 | "log-interval": 10, 87 | "steps_per_print": 10, 88 | "wall_clock_breakdown": true, 89 | 90 | "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 91 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 92 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 93 | 94 | "tokenizer-type": "HFTokenizer", 95 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" 96 | 97 | } 98 | -------------------------------------------------------------------------------- /configs/175B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 
| # model settings 9 | "num_layers": 96, 10 | "hidden_size": 12288, 11 | "num_attention_heads": 96, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00006, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000006, 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data_impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint_activations": true, 57 | "checkpoint_num_layers": 1, 58 | "partition_activations": true, 59 | "synchronize_each_layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train_iters": 320000, 79 | "lr_decay_iters": 320000, 80 | "distributed_backend": "nccl", 81 | "lr_decay_style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint_factor": 10000, 84 | "eval_interval": 1000, 85 | "eval_iters": 10, 86 | 87 | # logging 88 | "log_interval": 100, 89 | "steps_per_print": 10, 90 | "keep_last_n_checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/350M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 1024, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0003, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00003, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data_impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint_activations": true, 57 | "checkpoint_num_layers": 1, 58 | "partition_activations": true, 59 | "synchronize_each_layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train_iters": 320000, 79 | "lr_decay_iters": 320000, 80 | "distributed_backend": "nccl", 81 | "lr_decay_style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint_factor": 10000, 84 | "eval_interval": 1000, 85 | "eval_iters": 10, 86 | 87 | # logging 88 | "log_interval": 100, 89 | "steps_per_print": 10, 90 | "keep_last_n_checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/1-3B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 2048, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0002, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00002, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/2-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 32, 10 | "hidden_size": 2560, 11 | "num_attention_heads": 32, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00016, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000016, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/6-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 32, 10 | "hidden_size": 4096, 11 | "num_attention_heads": 32, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00012, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | "min_lr": 0.000012, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/13B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 40, 10 | "hidden_size": 5120, 11 | "num_attention_heads": 40, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0001, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | "min_lr": 0.00001, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 4, 55 | "data_impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight_decay": 0.1, 66 | "hidden_dropout": 0, 67 | "attention_dropout": 0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "fp16": true, 72 | "enabled": true, 73 | "loss_scale": 0, 74 | "loss_scale_window": 1000, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1 77 | }, 78 | 79 | # misc. 
training settings 80 | "train_iters": 320000, 81 | "lr_decay_iters": 320000, 82 | "distributed_backend": "nccl", 83 | "lr_decay_style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint_factor": 10000, 86 | "eval_interval": 1000, 87 | "eval_iters": 10, 88 | 89 | # logging 90 | "log_interval": 100, 91 | "steps_per_print": 10, 92 | "keep_last_n_checkpoints": 4, 93 | "wall_clock_breakdown": true, 94 | } 95 | -------------------------------------------------------------------------------- /configs/760M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 1536, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00025, 35 | "betas": [0.9, 0.999], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000025, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_tensor_model_parallel_group 35 | from .initialize import get_tensor_model_parallel_rank 36 | from .initialize import get_tensor_model_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import ColumnParallelLinear 42 | from .layers import RowParallelLinear 43 | from .layers import VocabParallelEmbedding 44 | from .layers import ParallelRelativePositionBias 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | from .mappings import reduce_scatter_to_sequence_parallel_region 51 | from .mappings import gather_from_sequence_parallel_region 52 | from .mappings import scatter_to_sequence_parallel_region 53 | 54 | from .random import checkpoint 55 | from .random import get_cuda_rng_tracker 56 | from .random import model_parallel_cuda_manual_seed 57 | 58 | from .utils import divide 59 | from .utils import split_tensor_along_last_dim 60 | -------------------------------------------------------------------------------- /configs/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | 
"rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 4, 55 | "data_impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight_decay": 0.1, 66 | "hidden_dropout": 0.0, 67 | "attention_dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path" 96 | } 97 | -------------------------------------------------------------------------------- /configs/125M/512/125M_cope.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "cope", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 32, 53 | 
"data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0.0, 65 | "attention-dropout": 0.0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. training settings 77 | "train-iters": 50000, 78 | "lr-decay-iters": 50000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 5000, 84 | "eval-iters": 20, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | 92 | # networking 93 | "hostfile": "/mock_path", 94 | "save": "checkpoints/125M_cope", 95 | "load": "checkpoints/125M_cope", 96 | "tensorboard-dir": "tensorboard/125M_cope", 97 | "log-dir": "logs/125M_cope", 98 | } 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb logs 132 | wandb/ 133 | 134 | # data files 135 | data/**/*.idx 136 | data/**/*.bin 137 | data/**/*.json* 138 | data/**/*.txt 139 | data/**/*.gz 140 | data/**/*.zip 141 | data/**/*.np* 142 | data/**/*.npy 143 | checkpoints/ 144 | .vscode/ 145 | *.pt 146 | *.ckpt 147 | 148 | #test logs 149 | test_checkpoint/ 150 | test_logs/ 151 | logs/ 152 | tensorboard/ 153 | src/ 154 | 155 | # test data files 156 | tests/data/*.bin 157 | tests/data/*.idx 158 | -------------------------------------------------------------------------------- /configs/rwkv/170M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for rwkv 3 | "pipe_parallel_size": 1, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 12, 7 | "hidden_size": 768, 8 | "num_attention_heads": 12, # head_size = dim_att / num_attention_heads. 9 | # head_size is 64 for all rwkv models 10 | "seq_length": 512, 11 | "max_position_embeddings": 2048, 12 | "output_layer_parallelism": "column", 13 | "norm": "rmsnorm", 14 | "rms_norm_epsilon": 1.0e-5, 15 | "train_micro_batch_size_per_gpu": 32, 16 | 17 | "attention_config": [[["rwkv"], 12]], 18 | 19 | "activation": "silu", 20 | 21 | # model settings 22 | 23 | #"pos_emb": "rotary", 24 | "rotary_pct": 0.25, 25 | "no_weight_tying": true, 26 | "gpt_j_residual": true, 27 | 28 | # these should provide some speedup but takes a while to build, set to true if desired 29 | "scaled_upper_triang_masked_softmax_fusion": false, 30 | "bias_gelu_fusion": false, 31 | "rope_fusion": false, 32 | "layernorm_fusion": false, 33 | 34 | 35 | # init methods 36 | "init_method": "small_init", 37 | "output_layer_init_method": "wang_init", 38 | 39 | # optimizer settings 40 | "optimizer": { 41 | "type": "Adam", 42 | "params": { 43 | "lr": 0.0008, 44 | "betas": [0.9, 0.95], 45 | "eps": 1.0e-8, 46 | } 47 | }, 48 | "min_lr": 0.00008, 49 | 50 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 51 | "zero_optimization": { 52 | "stage": 1, 53 | "allgather_partitions": True, 54 | "allgather_bucket_size": 500000000, 55 | "overlap_comm": True, 56 | "reduce_scatter": True, 57 | "reduce_bucket_size": 500000000, 58 | "contiguous_gradients": True, 59 | }, 60 | 61 | # batch / data settings 62 | "data_impl": "mmap", 63 | "num_workers": 1, 64 | 65 | # activation checkpointing 66 | "checkpoint_activations": true, 67 | "checkpoint_num_layers": 1, 68 | "partition_activations": true, 69 | "synchronize_each_layer": true, 70 | 71 | # regularization 72 | "gradient_clipping": 1.0, 73 | "weight_decay": 0.1, 74 | "hidden_dropout": 0, 75 | "attention_dropout": 0, 76 | 77 | # precision settings 78 | "bf16": { 79 | "bf16": true, 80 | "enabled": true, 81 | "loss_scale": 0, 82 | "loss_scale_window": 1000, 83 | "initial_scale_power": 12, 84 | "hysteresis": 2, 85 | "min_loss_scale": 1, 86 | }, 87 | 88 | # misc. 
training settings 89 | "train_iters": 500, 90 | "lr_decay_iters": 500, 91 | "distributed_backend": "nccl", 92 | "lr_decay_style": "constant", 93 | "warmup": 0.01, 94 | "checkpoint_factor": 100, 95 | "eval_interval": 100000, 96 | "eval_iters": 10, 97 | 98 | # logging 99 | "log_interval": 10, 100 | "steps_per_print": 10, 101 | "wall_clock_breakdown": true, 102 | } 103 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from megatron import print_rank_0 26 | from megatron import mpu 27 | 28 | 29 | class BlendableDataset(torch.utils.data.Dataset): 30 | def __init__(self, datasets, weights): 31 | self.datasets = datasets 32 | num_datasets = len(datasets) 33 | assert num_datasets == len(weights) 34 | 35 | self.size = 0 36 | for dataset in self.datasets: 37 | self.size += len(dataset) 38 | 39 | # Normalize weights. 40 | weights = np.array(weights, dtype=np.float64) 41 | sum_weights = np.sum(weights) 42 | assert sum_weights > 0.0 43 | weights /= sum_weights 44 | 45 | # Build indices. 
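# helpers.build_blending_indices (imported just below) populates two lookup
# arrays: dataset_index records, for each global sample, which dataset it is
# drawn from (stored as uint8, hence the num_datasets < 255 assert above), and
# dataset_sample_index records that sample's position within the chosen
# dataset; __getitem__ later uses this pair to dispatch to the underlying
# dataset. The normalized weights passed in determine how often each dataset
# is selected.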
46 | start_time = time.time() 47 | assert num_datasets < 255 48 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 49 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 50 | 51 | from megatron.data import helpers 52 | 53 | helpers.build_blending_indices( 54 | self.dataset_index, 55 | self.dataset_sample_index, 56 | weights, 57 | num_datasets, 58 | self.size, 59 | torch.distributed.get_rank() == 0, 60 | ) 61 | 62 | print( 63 | "> RANK {} elapsed time for building blendable dataset indices: " 64 | "{:.2f} (sec)".format( 65 | torch.distributed.get_rank(), time.time() - start_time 66 | ) 67 | ) 68 | 69 | def __len__(self): 70 | return self.size 71 | 72 | def __getitem__(self, idx): 73 | try: 74 | dataset_idx = self.dataset_index[idx] 75 | sample_idx = self.dataset_sample_index[idx] 76 | return self.datasets[dataset_idx][sample_idx] 77 | except IndexError: 78 | new_idx = idx % len(self) 79 | print( 80 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 81 | ) 82 | return self[new_idx] 83 | -------------------------------------------------------------------------------- /megatron/model/rwkv/v6/cuda/wkv6_op.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | typedef at::BFloat16 bf16; 4 | 5 | void cuda_forward(int B, 6 | int T, 7 | int C, 8 | int H, 9 | bf16* r, 10 | bf16* k, 11 | bf16* v, 12 | float* w, 13 | bf16* u, 14 | bf16* y); 15 | void cuda_backward(int B, 16 | int T, 17 | int C, 18 | int H, 19 | bf16* r, 20 | bf16* k, 21 | bf16* v, 22 | float* w, 23 | bf16* u, 24 | bf16* gy, 25 | bf16* gr, 26 | bf16* gk, 27 | bf16* gv, 28 | bf16* gw, 29 | bf16* gu); 30 | 31 | void forward(int64_t B, 32 | int64_t T, 33 | int64_t C, 34 | int64_t H, 35 | torch::Tensor& r, 36 | torch::Tensor& k, 37 | torch::Tensor& v, 38 | torch::Tensor& w, 39 | torch::Tensor& u, 40 | torch::Tensor& y) 41 | { 42 | cuda_forward(B, 43 | T, 44 | C, 45 | H, 46 | r.data_ptr(), 47 | k.data_ptr(), 48 | v.data_ptr(), 49 | w.data_ptr(), 50 | u.data_ptr(), 51 | y.data_ptr()); 52 | } 53 | void backward(int64_t B, 54 | int64_t T, 55 | int64_t C, 56 | int64_t H, 57 | torch::Tensor& r, 58 | torch::Tensor& k, 59 | torch::Tensor& v, 60 | torch::Tensor& w, 61 | torch::Tensor& u, 62 | torch::Tensor& gy, 63 | torch::Tensor& gr, 64 | torch::Tensor& gk, 65 | torch::Tensor& gv, 66 | torch::Tensor& gw, 67 | torch::Tensor& gu) 68 | { 69 | cuda_backward(B, 70 | T, 71 | C, 72 | H, 73 | r.data_ptr(), 74 | k.data_ptr(), 75 | v.data_ptr(), 76 | w.data_ptr(), 77 | u.data_ptr(), 78 | gy.data_ptr(), 79 | gr.data_ptr(), 80 | gk.data_ptr(), 81 | gv.data_ptr(), 82 | gw.data_ptr(), 83 | gu.data_ptr()); 84 | } 85 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 86 | { 87 | m.def("forward", &forward, "wkv6 forward"); 88 | m.def("backward", &backward, "wkv6 backward"); 89 | } 90 | 91 | TORCH_LIBRARY(wkv6, m) 92 | { 93 | m.def("forward", forward); 94 | m.def("backward", backward); 95 | } 96 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 3 | # across the node boundaries ) 4 | "pipe-parallel-size": 1, 5 | "model-parallel-size": 1, 6 | 7 | # model settings 8 | "num-layers": 12, 9 | "hidden-size": 768, 10 | "num-attention-heads": 12, 11 | 
"seq-length": 512, 12 | "max-position-embeddings": 2048, 13 | "norm": "layernorm", 14 | "pos-emb": "fire", 15 | "no-weight-tying": true, 16 | "gpt_j_residual": false, 17 | "output_layer_parallelism": "column", 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled-upper-triang-masked-softmax-fusion": false, 21 | "bias-gelu-fusion": false, 22 | 23 | "mlp_width": 32, 24 | "noise_seq_length": 128, 25 | # init methods 26 | "init_method": "small_init", 27 | "output_layer_init_method": "wang_init", 28 | 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0006, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00006, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 32, 54 | "data-impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint-activations": true, 58 | "checkpoint-num-layers": 1, 59 | "partition-activations": true, 60 | "synchronize-each-layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight-decay": 0.1, 65 | "hidden-dropout": 0.0, 66 | "attention-dropout": 0.0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. training settings 78 | "train-iters": 50000, 79 | "lr-decay-iters": 50000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 10000, 84 | "eval-interval": 5000, 85 | "eval-iters": 20, 86 | 87 | # logging 88 | "log-interval": 100, 89 | "steps_per_print": 10, 90 | "keep-last-n-checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | 93 | # networking 94 | "hostfile": "/mock_path", 95 | "save": "checkpoints", 96 | "load": "checkpoints", 97 | "tensorboard-dir": "tensorboard", 98 | "log-dir": "logs", 99 | } 100 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor); 26 | 27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads, 28 | torch::Tensor const& softmax_results, 29 | float scale_factor); 30 | 31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) 32 | { 33 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 34 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 35 | (input.scalar_type() == at::ScalarType::BFloat16), 36 | "Only fp16 and bf16 are supported"); 37 | 38 | return fwd_cuda(input, scale_factor); 39 | } 40 | 41 | torch::Tensor bwd(torch::Tensor const& output_grads, 42 | torch::Tensor const& softmax_results, 43 | float scale_factor) 44 | { 45 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 46 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 47 | 48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 49 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 50 | "Only fp16 and bf16 are supported"); 51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 63 | { 64 | m.def("forward", 65 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 66 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 67 | m.def("backward", 68 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 69 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 70 | } 71 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | plausibility check for the usage of neox_args in the megatron codebase 17 | """ 18 | import pytest 19 | import re 20 | from ..common import get_root_directory 21 | 22 | 23 | @pytest.mark.cpu 24 | def test_neoxargs_usage(): 25 | """ " 26 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs 27 | """ 28 | from megatron.neox_arguments import NeoXArgs 29 | 30 | declared_all = True 31 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys()) 32 | 33 | # we exclude a number of properties (implemented with the @property decorator) or functions that we know exists 34 | exclude = set( 35 | [ 36 | "params_dtype", 37 | "deepspeed_config", 38 | "get", 39 | "pop", 40 | "get_deepspeed_main_args", 41 | 'optimizer["params"]', 42 | "attention_config[layer_number]", 43 | "adlr_autoresume_object", 44 | "update_value", 45 | "all_config", 46 | "tensorboard_writer", 47 | "tokenizer", 48 | "train_batch_size]", 49 | "items", 50 | "configure_distributed_args", 51 | "build_tokenizer", 52 | "attention_config[i]", 53 | "print", 54 | "update", 55 | ] 56 | ) 57 | 58 | # test file by file 59 | for filename in (get_root_directory() / "megatron").glob("**/*.py"): 60 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]: 61 | continue 62 | 63 | # load file 64 | with open(filename, "r") as f: 65 | file_contents = f.read() 66 | 67 | # find args matches 68 | matches = list( 69 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) 70 | ) 71 | if len(matches) == 0: 72 | continue 73 | 74 | # compare 75 | for match in matches: 76 | if match not in neox_args_attributes and match not in exclude: 77 | print( 78 | f"(arguments used not found in neox args): {filename.name}: {match}", 79 | flush=True, 80 | ) 81 | declared_all = False 82 | 83 | assert declared_all, "all arguments used in code defined in NeoXArgs" 84 | -------------------------------------------------------------------------------- /configs/125M/512/125M_alibi.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "alibi", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | 
"reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_alibi_c.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "alibi_c", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 
| "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_kerple.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "kerple", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 5000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire_c.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "fire_c", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M-moe.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # See README for MoE config docs! 4 | "moe_type": "deepspeed", 5 | "moe_token_dropping": true, 6 | # Have 4 experts per layer (every 2 layers by default) 7 | "moe_num_experts": 4, 8 | # parallelism settings 9 | "enable_expert_tensor_parallelism": true, 10 | "pipe_parallel_size": 1, # not yet supported for MoE 11 | "model_parallel_size": 1, 12 | "moe_expert_parallel_size": 1, 13 | 14 | # model settings 15 | "num_layers": 12, 16 | "hidden_size": 768, 17 | "num_attention_heads": 12, 18 | "seq_length": 2048, 19 | "max_position_embeddings": 2048, 20 | "norm": "layernorm", 21 | "pos_emb": "rotary", 22 | "no_weight_tying": true, 23 | "gpt_j_residual": false, 24 | "output_layer_parallelism": "column", 25 | 26 | # these should provide some speedup but takes a while to build, set to true if desired 27 | "scaled_upper_triang_masked_softmax_fusion": false, 28 | "bias_gelu_fusion": false, 29 | "rope_fusion": false, 30 | 31 | # init methods 32 | "init_method": "small_init", 33 | "output_layer_init_method": "wang_init", 34 | 35 | 36 | # optimizer settings 37 | "optimizer": { 38 | "type": "Adam", 39 | "params": { 40 | "lr": 0.0006, 41 | "betas": [0.9, 0.95], 42 | "eps": 1.0e-8, 43 | } 44 | }, 45 | "min_lr": 0.00006, 46 | 47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 48 | "zero_optimization": { 49 | "stage": 1, 50 | "allgather_partitions": True, 51 | "allgather_bucket_size": 500000000, 52 | "overlap_comm": True, 53 | "reduce_scatter": True, 54 | "reduce_bucket_size": 500000000, 55 | "contiguous_gradients": True, 56 | }, 57 | 58 | # batch / data settings 59 | "train_micro_batch_size_per_gpu": 4, 60 | "data_impl": "mmap", 61 | 62 | # activation checkpointing 63 | "checkpoint_activations": true, 64 | "checkpoint_num_layers": 1, 65 | "partition_activations": true, 66 | "synchronize_each_layer": true, 67 | 68 | # regularization 69 | "gradient_clipping": 1.0, 70 | "weight_decay": 0.1, 71 | "hidden_dropout": 0.0, 72 | "attention_dropout": 0.0, 73 | 74 | # precision settings 75 | "fp16": { 76 | "enabled": true, 77 | "loss_scale": 0, 78 | "loss_scale_window": 1000, 79 | "hysteresis": 2, 80 | "min_loss_scale": 1 81 | }, 82 | 83 | # misc. 
training settings 84 | "train_iters": 320000, 85 | "lr_decay_iters": 320000, 86 | "distributed_backend": "nccl", 87 | "lr_decay_style": "cosine", 88 | "warmup": 0.01, 89 | "checkpoint_factor": 10000, 90 | "eval_interval": 1000, 91 | "eval_iters": 10, 92 | 93 | # logging 94 | "log_interval": 10, 95 | "steps_per_print": 10, 96 | "keep_last_n_checkpoints": 4, 97 | "wall_clock_breakdown": true, 98 | 99 | # networking 100 | "hostfile": "/mock_path" 101 | } 102 | -------------------------------------------------------------------------------- /configs/125M-dmoe.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # See README for MoE config docs! 4 | "moe_type": "megablocks", 5 | "moe_token_dropping": false, 6 | # Have 4 experts per layer (every 2 layers by default) 7 | "moe_num_experts": 4, 8 | # parallelism settings 9 | "enable_expert_tensor_parallelism": true, 10 | "pipe_parallel_size": 1, # not yet supported for MoE 11 | "model_parallel_size": 1, 12 | "moe_expert_parallel_size": 1, 13 | 14 | # model settings 15 | "num_layers": 12, 16 | "hidden_size": 768, 17 | "num_attention_heads": 12, 18 | "seq_length": 2048, 19 | "max_position_embeddings": 2048, 20 | "norm": "layernorm", 21 | "pos_emb": "rotary", 22 | "no_weight_tying": true, 23 | "gpt_j_residual": false, 24 | "output_layer_parallelism": "column", 25 | 26 | # these should provide some speedup but takes a while to build, set to true if desired 27 | "scaled_upper_triang_masked_softmax_fusion": false, 28 | "bias_gelu_fusion": false, 29 | "rope_fusion": false, 30 | 31 | # init methods 32 | "init_method": "small_init", 33 | "output_layer_init_method": "wang_init", 34 | 35 | 36 | # optimizer settings 37 | "optimizer": { 38 | "type": "Adam", 39 | "params": { 40 | "lr": 0.0006, 41 | "betas": [0.9, 0.95], 42 | "eps": 1.0e-8, 43 | } 44 | }, 45 | "min_lr": 0.00006, 46 | 47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 48 | "zero_optimization": { 49 | "stage": 0, 50 | "allgather_partitions": True, 51 | "allgather_bucket_size": 500000000, 52 | "overlap_comm": True, 53 | "reduce_scatter": True, 54 | "reduce_bucket_size": 500000000, 55 | "contiguous_gradients": True, 56 | }, 57 | 58 | # batch / data settings 59 | "train_micro_batch_size_per_gpu": 4, 60 | "data_impl": "mmap", 61 | 62 | # activation checkpointing 63 | "checkpoint_activations": true, 64 | "checkpoint_num_layers": 1, 65 | "partition_activations": true, 66 | "synchronize_each_layer": true, 67 | 68 | # regularization 69 | "gradient_clipping": 1.0, 70 | "weight_decay": 0.1, 71 | "hidden_dropout": 0.0, 72 | "attention_dropout": 0.0, 73 | 74 | # precision settings 75 | "fp16": { 76 | "enabled": true, 77 | "loss_scale": 0, 78 | "loss_scale_window": 1000, 79 | "hysteresis": 2, 80 | "min_loss_scale": 1 81 | }, 82 | 83 | # misc. 
training settings 84 | "train_iters": 320000, 85 | "lr_decay_iters": 320000, 86 | "distributed_backend": "nccl", 87 | "lr_decay_style": "cosine", 88 | "warmup": 0.01, 89 | "checkpoint_factor": 10000, 90 | "eval_interval": 1000, 91 | "eval_iters": 10, 92 | 93 | # logging 94 | "log_interval": 10, 95 | "steps_per_print": 10, 96 | "keep_last_n_checkpoints": 4, 97 | "wall_clock_breakdown": true, 98 | 99 | # networking 100 | "hostfile": "/mock_path" 101 | } 102 | -------------------------------------------------------------------------------- /configs/125M/512/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 32, 53 | "data-impl": "mmap", 54 | 55 | "mlp_width": 32, 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints/125M/none_c_ffn", 97 | "load": "checkpoints/125M/none_c_ffn", 98 | "tensorboard-dir": "tensorboard/125M/none_c_ffn", 99 | "log-dir": "logs/125M/none_c_ffn", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire_capev2.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "capev2", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "capev2_kernel": 3, 26 | "noise_seq_length": 512, 27 | # init methods 28 | "init_method": "small_init", 29 | "output_layer_init_method": "wang_init", 30 | 31 | 32 | # optimizer settings 33 | "optimizer": { 34 | "type": "Adam", 35 | "params": { 36 | "lr": 0.0006, 37 | "betas": [0.9, 0.95], 38 | "eps": 1.0e-8, 39 | } 40 | }, 41 | "min_lr": 0.00006, 42 | 43 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 44 | "zero_optimization": { 45 | "stage": 1, 46 | "allgather_partitions": True, 47 | "allgather_bucket_size": 500000000, 48 | "overlap_comm": True, 49 | "reduce_scatter": True, 50 | "reduce_bucket_size": 500000000, 51 | "contiguous_gradients": True, 52 | }, 53 | 54 | # batch / data settings 55 | "train_micro_batch_size_per_gpu": 32, 56 | "data-impl": "mmap", 57 | 58 | # activation checkpointing 59 | "checkpoint-activations": true, 60 | "checkpoint-num-layers": 1, 61 | "partition-activations": true, 62 | "synchronize-each-layer": true, 63 | 64 | # regularization 65 | "gradient_clipping": 1.0, 66 | "weight-decay": 0.1, 67 | "hidden-dropout": 0.0, 68 | "attention-dropout": 0.0, 69 | 70 | # precision settings 71 | "fp16": { 72 | "enabled": true, 73 | "loss_scale": 0, 74 | "loss_scale_window": 1000, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1 77 | }, 78 | 79 | # misc. 
training settings 80 | "train-iters": 50000, 81 | "lr-decay-iters": 50000, 82 | "distributed-backend": "nccl", 83 | "lr-decay-style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint-factor": 10000, 86 | "eval-interval": 5000, 87 | "eval-iters": 20, 88 | 89 | # logging 90 | "log-interval": 100, 91 | "steps_per_print": 10, 92 | "keep-last-n-checkpoints": 4, 93 | "wall_clock_breakdown": true, 94 | 95 | # networking 96 | "hostfile": "/mock_path", 97 | "save": "checkpoints", 98 | "load": "checkpoints", 99 | "tensorboard-dir": "tensorboard", 100 | "log-dir": "logs", 101 | } 102 | --------------------------------------------------------------------------------
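Note on the schedule fields shared by the configs above ("lr", "min_lr", "warmup", "lr_decay_iters"/"lr-decay-iters", "lr_decay_style": "cosine"): the sketch below shows how those numbers interact under the conventional linear-warmup-then-cosine-decay reading of them. The function name and the exact formula are assumptions for illustration only and are not taken from the repository's scheduler code.

    # Minimal sketch of a warmup + cosine-decay learning-rate schedule using the
    # field names that appear in these configs. Assumption: linear warmup over
    # warmup * decay_iters steps, then cosine decay from lr down to min_lr.
    # Illustrative only; not the repository's actual scheduler implementation.
    import math

    def lr_at_step(step, lr=6.0e-4, min_lr=6.0e-5, warmup=0.01, decay_iters=320000):
        warmup_iters = int(warmup * decay_iters)     # "warmup" read as a fraction of decay_iters (assumed)
        if step < warmup_iters:
            return lr * step / max(1, warmup_iters)  # linear ramp from 0 up to lr
        progress = min(1.0, (step - warmup_iters) / max(1, decay_iters - warmup_iters))
        return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    print(lr_at_step(0), lr_at_step(3200), lr_at_step(320000))  # ~0.0, ~6.0e-4, ~6.0e-5

With the 125M-style settings (lr 6.0e-4, min_lr 6.0e-5, warmup 0.01, 320000 decay iters), this reading gives 0 at step 0, the peak rate at step 3200, and min_lr at step 320000.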