├── tests
├── __init__.py
├── unit
│ ├── __init__.py
│ ├── test_dependencies.py
│ ├── test_tokenizer.py
│ ├── test_url_accessibility.py
│ ├── test_format_conversion_scripts.py
│ └── test_arguments.py
├── data
│ └── sample_prompt.txt
├── neox_args
│ ├── __init__.py
│ ├── test_neoxargs_implementation.py
│ └── test_neoxargs_usage.py
├── cpu_tests
│ └── docker-compose.yml
├── model
│ └── __init__.py
├── pytest.ini
└── config
│ └── test_setup.yml
├── tools
├── __init__.py
├── bash
│ ├── kill.sh
│ ├── killall.sh
│ ├── README.md
│ ├── sync_cmd.sh
│ ├── sync.sh
│ └── syncdir.sh
├── README.md
├── datasets
│ ├── dataset_token_count.py
│ ├── multinode_prepare_data.sh
│ └── merge_datasets.py
└── ckpts
│ └── upload.py
├── megatron
├── model
│ ├── rwkv
│ │ ├── __init__.py
│ │ └── v6
│ │ │ ├── __init__.py
│ │ │ └── cuda
│ │ │ └── wkv6_op.cpp
│ ├── mamba
│ │ └── __init__.py
│ ├── megablocks_utils.py
│ ├── __init__.py
│ └── fused_bias_dropout.py
├── data
│ ├── __init__.py
│ ├── Makefile
│ └── blendable_dataset.py
├── gradient_noise_scale
│ └── __init__.py
├── tokenizer
│ └── __init__.py
├── fused_kernels
│ ├── compat.h
│ └── scaled_upper_triang_masked_softmax.cpp
├── __init__.py
├── devutil.py
├── mpu
│ ├── random.py
│ └── __init__.py
└── neox_arguments
│ └── template.py
├── .dockerignore
├── requirements
├── requirements-apex-pip.txt
├── requirements-comet.txt
├── requirements-wandb.txt
├── requirements-sparseattention.txt
├── requirements-flashattention.txt
├── requirements-onebitadam.txt
├── requirements-s3.txt
├── requirements-tensorboard.txt
├── requirements-transformerengine.txt
├── requirements-mamba.txt
├── requirements-dev.txt
└── requirements.txt
├── MANIFEST.in
├── images
├── memory_profiling.png
├── nsight_profiling.png
└── pytorch_profiling.png
├── .idea
├── misc.xml
├── vcs.xml
├── .gitignore
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
└── DAPE.iml
├── configs
├── cpu_mock_config.yml
├── slurm_local.json
├── slurm_local.yml
├── prof.yml
├── docker
│ └── pythia-paths.yml
├── text_generation.yml
├── sparse.yml
├── llama
│ ├── 13B.yml
│ ├── 30B.yml
│ ├── 65B.yml
│ ├── 7B.yml
│ ├── README.md
│ └── train_config.yml
├── llama2
│ ├── 13B.yml
│ ├── 7B.yml
│ ├── 70B.yml
│ ├── codellama_7B.yml
│ └── codellama_34B.yml
├── local_setup.yml
├── eleutherai_cluster.yml
├── local_setup_wandb.yml
├── mistral
│ └── 7B.yml
├── local_setup_comet.yml
├── slurm_125M.yml
├── 125M-json.yml
├── autotuning_configs
│ ├── tune_6-7B.json
│ ├── tune.json
│ ├── small_tune.json
│ └── tune_1-3B.json
├── gmlp_small.yml
├── pythia
│ ├── 70M.yml
│ ├── 160M.yml
│ ├── 1-4B.yml
│ ├── 410M.yml
│ ├── 6-9B.yml
│ ├── 12B.yml
│ ├── 1B.yml
│ ├── 2-8B.yml
│ ├── 31M.yml
│ └── 14M.yml
├── 800M.yml
├── finetuning_configs
│ └── 6-9B.yml
├── bf16_125M.yml
├── mamba
│ ├── mamba-130M.yml
│ ├── mamba-370M.yml
│ ├── mamba-1.4B.yml
│ ├── mamba-2.8B.yml
│ └── mamba-790M.yml
├── 49M.yml
├── bnb_125M.yml
├── 19M.yml
├── 175B.yml
├── 350M.yml
├── 1-3B.yml
├── 2-7B.yml
├── 6-7B.yml
├── 13B.yml
├── 760M.yml
├── 125M.yml
├── 125M
│ └── 512
│ │ ├── 125M_cope.yml
│ │ ├── 125M_fire.yml
│ │ ├── 125M_alibi.yml
│ │ ├── 125M_alibi_c.yml
│ │ ├── 125M_kerple.yml
│ │ ├── 125M_fire_c.yml
│ │ ├── 125M.yml
│ │ └── 125M_fire_capev2.yml
├── rwkv
│ └── 170M.yml
├── 125M-moe.yml
└── 125M-dmoe.yml
├── docker-compose-dockerhub.yml
├── eval_tasks
└── __init__.py
├── docker-compose.yml
├── deepy.py
├── train.py
├── .pre-commit-config.yaml
├── README-MUP.md
├── post-training
├── llama_data.py
└── recreating_zephyr_dpo.md
├── prepare_data.py
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/megatron/model/rwkv/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | 20B_checkpoints/
2 |
--------------------------------------------------------------------------------
/tools/bash/kill.sh:
--------------------------------------------------------------------------------
1 | pkill -9 python
2 |
--------------------------------------------------------------------------------
/megatron/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *
2 |
--------------------------------------------------------------------------------
/requirements/requirements-apex-pip.txt:
--------------------------------------------------------------------------------
1 | pip==23.3.2
2 |
--------------------------------------------------------------------------------
/requirements/requirements-comet.txt:
--------------------------------------------------------------------------------
1 | comet_ml>=3.45.0
2 |
--------------------------------------------------------------------------------
/requirements/requirements-wandb.txt:
--------------------------------------------------------------------------------
1 | wandb>=0.10.28
2 |
--------------------------------------------------------------------------------
/requirements/requirements-sparseattention.txt:
--------------------------------------------------------------------------------
1 | triton==2.1.0
2 |
--------------------------------------------------------------------------------
/tests/data/sample_prompt.txt:
--------------------------------------------------------------------------------
1 | Hello, I'm a language model
2 |
--------------------------------------------------------------------------------
/requirements/requirements-flashattention.txt:
--------------------------------------------------------------------------------
1 | flash-attn==2.5.6
2 |
--------------------------------------------------------------------------------
/requirements/requirements-onebitadam.txt:
--------------------------------------------------------------------------------
1 | cupy-cuda111>=8.6.0
2 |
--------------------------------------------------------------------------------
/requirements/requirements-s3.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | hf-transfer>=0.1.3
3 |
--------------------------------------------------------------------------------
/requirements/requirements-tensorboard.txt:
--------------------------------------------------------------------------------
1 | tensorboard==2.13.0
2 |
--------------------------------------------------------------------------------
/tools/bash/killall.sh:
--------------------------------------------------------------------------------
1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py'
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include megatron/data/Makefile
2 | include megatron/data/helpers.cpp
3 |
--------------------------------------------------------------------------------
/megatron/gradient_noise_scale/__init__.py:
--------------------------------------------------------------------------------
1 | from .gradient_noise_scale import GradientNoiseScale
2 |
--------------------------------------------------------------------------------
/megatron/model/rwkv/v6/__init__.py:
--------------------------------------------------------------------------------
1 | from .rwkv import RWKVResidualLayerPipe, RWKVResidualLayer
2 |
--------------------------------------------------------------------------------
/images/memory_profiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/memory_profiling.png
--------------------------------------------------------------------------------
/images/nsight_profiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/nsight_profiling.png
--------------------------------------------------------------------------------
/images/pytorch_profiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/pytorch_profiling.png
--------------------------------------------------------------------------------
/requirements/requirements-transformerengine.txt:
--------------------------------------------------------------------------------
1 | pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
2 |
--------------------------------------------------------------------------------
/tests/neox_args/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | testing of implementation of command line arguments and configuration (NeoXArgs)
3 | """
4 |
--------------------------------------------------------------------------------
/megatron/model/mamba/__init__.py:
--------------------------------------------------------------------------------
1 | from .mamba import (
2 | ParallelMambaResidualLayer,
3 | ParallelMambaResidualLayerPipe,
4 | )
5 |
--------------------------------------------------------------------------------
/requirements/requirements-mamba.txt:
--------------------------------------------------------------------------------
1 | causal_conv1d>=1.1.0
2 | einops
3 | mamba_ssm>=1.2.0.post1 # required for untied embedding + unembedding layers
4 |
--------------------------------------------------------------------------------
/requirements/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | autopep8>=1.5.6
2 | clang-format>=13.0.1
3 | pre-commit>=2.17.0
4 | pytest>=6.2.3
5 | pytest-cov>=2.11.1
6 | pytest-forked>=1.3.0
7 | pytest-html==4.1.1
8 | pytest-xdist
9 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/configs/cpu_mock_config.yml:
--------------------------------------------------------------------------------
1 | # CPU unit tests should be independent of the presence of GPUs on the test server
2 | # host. This configuration mocks these GPU resources and other dependencies.
3 | {
4 | "global_num_gpus": 1
5 | }
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/tests/unit/test_dependencies.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from megatron import fused_kernels
3 |
4 |
5 | def test_fused_kernels():
6 | pytest.xfail(reason="Fused kernels require manual intervention to install")
7 | fused_kernels.load_fused_kernels()
8 |
--------------------------------------------------------------------------------
/configs/slurm_local.json:
--------------------------------------------------------------------------------
1 | {
2 | "vocab-file": "data/gpt2-vocab.json",
3 | "merge-file": "data/gpt2-merges.txt",
4 | "save": "checkpoints",
5 | "checkpoint_validation_with_forward_pass": false,
6 | "tensorboard-dir": "tensorboard",
7 | "log-dir": "logs"
8 | }
9 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/configs/slurm_local.yml:
--------------------------------------------------------------------------------
1 | {
2 | "data_path": "data/enwik8/enwik8_text_document",
3 | "vocab_file": "data/gpt2-vocab.json",
4 | "merge_file": "data/gpt2-merges.txt",
5 | "save": "checkpoints",
6 | "checkpoint_validation_with_forward_pass": false,
7 | "tensorboard_dir": "tensorboard",
8 | "log_dir": "logs",
9 | }
10 |
--------------------------------------------------------------------------------
/megatron/data/Makefile:
--------------------------------------------------------------------------------
1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
3 | LIBNAME = helpers
4 | LIBEXT = $(shell python3-config --extension-suffix)
5 |
6 | default: $(LIBNAME)$(LIBEXT)
7 |
8 | %$(LIBEXT): %.cpp
9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 |
--------------------------------------------------------------------------------
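The Makefile above builds the pybind11 `helpers` extension in place. A minimal sketch of driving that build from Python and importing the result; it assumes a C++ toolchain and pybind11 are installed and that it is run from the repository root:

```python
# Sketch only: build megatron/data/helpers with the Makefile above, then import it.
import subprocess

subprocess.run(["make"], cwd="megatron/data", check=True)  # emits helpers<ext-suffix>.so

from megatron.data import helpers  # noqa: E402  -- imported after the in-place build

print(helpers.__file__)
```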
/tests/unit/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from megatron.tokenizer import train_tokenizer
3 |
4 |
5 | @pytest.mark.cpu
6 | def test_train_tokenizer():
7 | input_args = [
8 | "--json_input_dir",
9 | "./tests/data/enwik8_first100.txt",
10 | "--tokenizer_output_path",
11 | "",
12 | ]
13 | args = train_tokenizer.parse_args(input_args)
14 | train_tokenizer.main(args)
15 |
--------------------------------------------------------------------------------
/configs/prof.yml:
--------------------------------------------------------------------------------
1 | # Sample profiling config
2 | {
3 | # Turns on nsys and pytorch profiling
4 | "profile": true,
5 |
6 | # pytorch profiler options
7 | "profile_step_start": 10,
8 | "profile_step_stop": 12,
9 |
10 | # pytorch memory profiler options
11 | "memory_profiling": true,
12 | "memory_profiling_path": tensorboard,
13 |
14 |
15 | # All trace files (pytorch, nsys, tensorboard, etc) will be written here
16 | "tensorboard_dir": "tensorboard",
17 | }
18 |
--------------------------------------------------------------------------------
/.idea/DAPE.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
2 | ftfy>=6.0.1
3 | huggingface_hub>=0.11.0
4 | jinja2==3.1.4
5 | lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
6 | lm_eval>=0.4.0,<=0.4.1
7 | mpi4py>=3.0.3
8 | numpy<2.0
9 | pybind11>=2.6.2
10 | regex
11 | sentencepiece
12 | six
13 | tiktoken>=0.1.2
14 | tokenizers>=0.12.1
15 | transformers==4.38.0
16 |
--------------------------------------------------------------------------------
/tools/bash/README.md:
--------------------------------------------------------------------------------
1 | # Bash Scripts
2 | Useful for running distributed per-node scripts on, e.g., Kubernetes clusters.
3 |
4 | * `kill.sh` kills all python processes
5 | * `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/`
6 | * `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/`
7 | * `sync.sh` uses pdcp to copy each listed file to the same path on every node listed in `/job/hosts/`
8 | * `syncdir.sh` uses pdcp to recursively copy each listed directory to the same path on every node listed in `/job/hosts/`
9 |
--------------------------------------------------------------------------------
/configs/docker/pythia-paths.yml:
--------------------------------------------------------------------------------
1 | {
2 | "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
3 | "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
4 | "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
5 |
6 | "tokenizer-type": "HFTokenizer",
7 | "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json",
8 |
9 | "save": "/home/mchorse/chk/",
10 | "load": "/home/mchorse/chk/",
11 | "checkpoint_validation_with_forward_pass": False
12 | }
13 |
--------------------------------------------------------------------------------
/configs/text_generation.yml:
--------------------------------------------------------------------------------
1 | # Parameters used for text generation
2 | # Make sure `load` is specified somewhere else
3 | {
4 | # Text gen type: `input-file`, `unconditional` or `interactive`
5 | "text_gen_type": "unconditional",
6 |
7 | # Params for all
8 | "maximum_tokens": 102,
9 | "prompt_end": "\n",
10 | "temperature": 1.0,
11 | "top_p": 0.0,
12 | "top_k": 0,
13 | "recompute": false,
14 |
15 | # `unconditional`: samples
16 | "num_samples": 10,
17 |
18 | # input/output file
19 | "sample_input_file": "sample_input.txt",
20 | "sample_output_file": "sample_output.txt",
21 | }
22 |
--------------------------------------------------------------------------------
/configs/sparse.yml:
--------------------------------------------------------------------------------
1 | # Add this to your config for sparse attention every other layer
2 | {
3 | "attention_config": [[["local", "global"], "all"]],
4 |
5 | # sparsity config:
6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for
7 | # illustrative purposes)
8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for
9 | # more detailed config instructions and available parameters
10 |
11 | "sparsity_config": {
12 | "block": 16, # block size
13 | "num_local_blocks": 32,
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
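For a rough sense of what the sparsity settings above mean: under the usual reading of DeepSpeed's sliding-window parameters, each query block attends locally over `block * num_local_blocks` tokens. A small illustrative calculation (not code from this repo):

```python
# Illustration only: local attention span implied by the sparsity_config above.
block, num_local_blocks = 16, 32
local_window_tokens = block * num_local_blocks
print(local_window_tokens)  # 512 tokens of local context per query block
```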
/tests/cpu_tests/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3' # slightly different to make sure CPU tests run without nvidia device
2 | services:
3 | gpt-neox:
4 | command: nvidia-smi dmon
5 | image: gpt-neox
6 | build:
7 | context: .
8 | dockerfile: Dockerfile
9 | shm_size: 1g
10 | ulimits:
11 | memlock:
12 | soft: -1
13 | hard: -1
14 | logging:
15 | options:
16 | max-size: "100m"
17 | max-file: "3"
18 | volumes:
19 | - ${NEOX_DATA_PATH}:/home/mchorse/data
20 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk
21 | - .:/home/mchorse/gpt-neox
22 |
--------------------------------------------------------------------------------
/tests/model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/docker-compose-dockerhub.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | gpt-neox:
4 | command: nvidia-smi dmon
5 | image: leogao2/gpt-neox:main
6 | shm_size: 1g
7 | ulimits:
8 | memlock:
9 | soft: -1
10 | hard: -1
11 | runtime: nvidia
12 | deploy:
13 | resources:
14 | reservations:
15 | devices:
16 | - driver: nvidia
17 | capabilities: [gpu]
18 | logging:
19 | options:
20 | max-size: "100m"
21 | max-file: "3"
22 | volumes:
23 | - ${NEOX_DATA_PATH}:/home/mchorse/data
24 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk
25 | - .:/home/mchorse/gpt-neox
26 |
--------------------------------------------------------------------------------
/eval_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness
16 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | gpt-neox:
4 | command: nvidia-smi dmon
5 | image: gpt-neox
6 | build:
7 | context: .
8 | dockerfile: Dockerfile
9 | shm_size: 1g
10 | ulimits:
11 | memlock:
12 | soft: -1
13 | hard: -1
14 | runtime: nvidia
15 | deploy:
16 | resources:
17 | reservations:
18 | devices:
19 | - driver: nvidia
20 | capabilities: [gpu]
21 | logging:
22 | options:
23 | max-size: "100m"
24 | max-file: "3"
25 | volumes:
26 | - ${NEOX_DATA_PATH}:/home/mchorse/data
27 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk
28 | - .:/home/mchorse/gpt-neox
29 |
--------------------------------------------------------------------------------
/megatron/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from .tokenizer import build_tokenizer
17 |
--------------------------------------------------------------------------------
/configs/llama/13B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 2,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 40,
8 | "hidden_size": 5120,
9 | "num_attention_heads": 40,
10 | "seq_length": 2048,
11 | "max_position_embeddings": 2048,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-6,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama/30B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 4,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 60,
8 | "hidden_size": 6656,
9 | "num_attention_heads": 52,
10 | "seq_length": 2048,
11 | "max_position_embeddings": 2048,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-6,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama/65B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 8,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 80,
8 | "hidden_size": 8192,
9 | "num_attention_heads": 64,
10 | "seq_length": 2048,
11 | "max_position_embeddings": 2048,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-6,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama/7B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 32,
8 | "hidden_size": 4096,
9 | "num_attention_heads": 32,
10 | "seq_length": 2048,
11 | "max_position_embeddings": 2048,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-6,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama2/13B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 2,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 40,
8 | "hidden_size": 5120,
9 | "num_attention_heads": 40,
10 | "seq_length": 4096,
11 | "max_position_embeddings": 4096,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-5,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama2/7B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 32,
8 | "hidden_size": 4096,
9 | "num_attention_heads": 32,
10 | "seq_length": 4096,
11 | "max_position_embeddings": 4096,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 1,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": false,
16 | "output_layer_parallelism": "column",
17 | "norm": "rmsnorm",
18 | "rms_norm_epsilon": 1.0e-5,
19 |
20 | "scaled_upper_triang_masked_softmax_fusion": true,
21 | "bias_gelu_fusion": false,
22 | "use_bias_in_norms": false,
23 | "use_bias_in_attn_linear": false,
24 | "activation": "swiglu",
25 | "mlp_multiple_of": 256,
26 | }
27 |
--------------------------------------------------------------------------------
/configs/llama/README.md:
--------------------------------------------------------------------------------
1 | # LLaMA
2 |
3 | ## Training and Finetuning
4 |
5 | These configs contain the architecture settings required to run inference/training/finetuning on the [LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite.
6 |
7 | LLaMA finetuning can be launched with
8 | ```sh
9 | python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml
10 | ```
11 |
12 | If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`.
13 |
14 |
15 | ## Inference
16 |
17 |
18 | LLaMA generation can be launched with
19 | ```sh
20 | python ./deepy.py ./generate.py -d configs \
21 | llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \
22 | -i input_prompt.txt -o prompt_out.txt
23 | ```
24 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | # GPT-NeoX Auxiliary Tools
2 |
3 | This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code.
4 |
5 | ## Bash
6 |
7 | This directory contains some simple, frequently used bash commands to make working on multiple machines easier.
8 |
9 | ## Checkpoints
10 |
11 | This directory contains tools for manipulating and converting checkpoints, including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the Hugging Face `transformers` library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x.
12 |
13 | ## Datasets
14 |
15 | This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library.
16 |
--------------------------------------------------------------------------------
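As a concrete example of the checkpoint tooling, the GPT-NeoX-to-Hugging-Face conversion can be driven from Python. The sketch below mirrors the flags exercised in `tests/unit/test_format_conversion_scripts.py`; the config, checkpoint, and output paths are hypothetical placeholders:

```python
# Sketch only: convert a GPT-NeoX checkpoint to Hugging Face format.
from tools.ckpts import convert_neox_to_hf

convert_neox_to_hf.main(
    input_args=[
        "--config_file", "configs/125M.yml",           # config the model was trained with
        "--input_dir", "checkpoints/global_step1000",  # hypothetical NeoX checkpoint directory
        "--output_dir", "checkpoints/hf_export",       # hypothetical output directory
    ],
    overwrite_values=None,  # optionally override config keys, as the unit test does
)
```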
/tests/unit/test_url_accessibility.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import requests
3 |
4 | from tools.datasets.corpora import DATA_DOWNLOADERS
5 |
6 |
7 | def check_url_accessible(url):
8 | try:
9 | response = requests.head(url, timeout=5)
10 | response.raise_for_status()
11 | return True
12 | except requests.exceptions.RequestException as e:
13 | print(f"Error: Unable to access URL - {e}")
14 | return False
15 |
16 |
17 | @pytest.mark.cpu
18 | @pytest.mark.parametrize("dataset_name", list(DATA_DOWNLOADERS.keys()))
19 | def test_url_accessibility(dataset_name):
20 | if dataset_name == "pass":
21 | return
22 | elif not dataset_name == "enwik8":
23 | pytest.xfail()
24 | for url in DATA_DOWNLOADERS[dataset_name].urls:
25 | assert check_url_accessible(url)
26 |
--------------------------------------------------------------------------------
/tests/pytest.ini:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | [pytest]
16 | markers =
17 | cpu: marks tests that can be run on cpu
18 | filterwarnings =
19 | ignore::DeprecationWarning:pkg_resources.*
20 | ignore::DeprecationWarning:torch.*
21 |
--------------------------------------------------------------------------------
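The `cpu` marker declared above selects the tests that do not need a GPU. Running just that subset programmatically is equivalent to `pytest -m cpu tests/` from the repository root:

```python
# Run only the tests marked "cpu" (marker declared in tests/pytest.ini).
import sys

import pytest

sys.exit(pytest.main(["-m", "cpu", "tests/"]))
```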
/tools/bash/sync_cmd.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #!/usr/bin/env bash
16 |
17 | # Runs a command in parallel across all nodes
18 | # Usage
19 | # sync_cmd.sh 'echo "hello world"'
20 |
21 | echo "Command: $1";
22 | pdsh -R ssh -w ^/job/hosts $1
23 |
--------------------------------------------------------------------------------
/configs/llama2/70B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 80,
8 | "hidden_size": 8192,
9 | "intermediate_size": 28672,
10 | "num_attention_heads": 64,
11 | "num_kv_heads": 8,
12 | "seq_length": 4096,
13 | "max_position_embeddings": 4096,
14 | "pos_emb": "rotary",
15 | "rotary_pct": 1,
16 | "rotary_emb_base": 1000000,
17 | "no_weight_tying": true,
18 | "gpt_j_residual": false,
19 | "output_layer_parallelism": "column",
20 | "norm": "rmsnorm",
21 | "rms_norm_epsilon": 1.0e-5,
22 |
23 | "attention_config": [[["flash"], 80]],
24 |
25 | "scaled_upper_triang_masked_softmax_fusion": true,
26 | "bias_gelu_fusion": false,
27 | "use_bias_in_norms": false,
28 | "use_bias_in_attn_linear": false,
29 | "activation": "swiglu",
30 | "mlp_multiple_of": 256,
31 | }
32 |
--------------------------------------------------------------------------------
/configs/llama2/codellama_7B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 32,
8 | "hidden_size": 4096,
9 | "num_attention_heads": 32,
10 | # Codellama was uptrained on 16k token sequence lengths
11 | # with rotary_emb_base adjusted to 1_000_000.
12 | "seq_length": 16384,
13 | "max_position_embeddings": 16384,
14 | "pos_emb": "rotary",
15 | "rotary_pct": 1,
16 | "rotary_emb_base": 1000000,
17 | "no_weight_tying": true,
18 | "gpt_j_residual": false,
19 | "output_layer_parallelism": "column",
20 | "norm": "rmsnorm",
21 | "rms_norm_epsilon": 1.0e-5,
22 |
23 | "attention_config": [[["flash"], 32]],
24 |
25 | "scaled_upper_triang_masked_softmax_fusion": true,
26 | "bias_gelu_fusion": false,
27 | "use_bias_in_norms": false,
28 | "use_bias_in_attn_linear": false,
29 | "activation": "swiglu",
30 | "mlp_multiple_of": 256,
31 | }
32 |
--------------------------------------------------------------------------------
/tools/bash/sync.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #!/usr/bin/env bash
16 |
17 | # Push files to all nodes
18 | # Usage
19 | # sync.sh file [file2..]
20 |
21 | echo Number of files to upload: $#
22 |
23 | for file in "$@"
24 | do
25 | full_path=$(realpath $file)
26 | echo Uploading $full_path
27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path
28 | done
29 |
--------------------------------------------------------------------------------
/configs/llama2/codellama_34B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 48,
8 | "hidden_size": 8192,
9 | "num_attention_heads": 64,
10 | "num_kv_heads": 8,
11 | # Codellama was uptrained on 16k token sequence lengths
12 | # with rotary_emb_base adjusted to 1_000_000.
13 | "seq_length": 16384,
14 | "max_position_embeddings": 16384,
15 | "pos_emb": "rotary",
16 | "rotary_pct": 1,
17 | "rotary_emb_base": 1000000,
18 | "no_weight_tying": true,
19 | "gpt_j_residual": false,
20 | "output_layer_parallelism": "column",
21 | "norm": "rmsnorm",
22 | "rms_norm_epsilon": 1.0e-5,
23 |
24 | "attention_config": [[["flash"], 48]],
25 |
26 | "scaled_upper_triang_masked_softmax_fusion": true,
27 | "bias_gelu_fusion": false,
28 | "use_bias_in_norms": false,
29 | "use_bias_in_attn_linear": false,
30 | "activation": "swiglu",
31 | "mlp_multiple_of": 256,
32 | }
33 |
--------------------------------------------------------------------------------
/megatron/model/megablocks_utils.py:
--------------------------------------------------------------------------------
1 | """Adapter to expose MegaBlocks package, if available."""
2 |
3 | try:
4 | import megablocks
5 | except ImportError:
6 | megablocks = None
7 |
8 |
9 | def megablocks_is_available():
10 | return megablocks is not None
11 |
12 |
13 | def assert_megablocks_is_available():
14 | assert (
15 | megablocks_is_available()
16 | ), "MegaBlocks not available. Please run `pip install megablocks`."
17 |
18 |
19 | moe = megablocks.layers.moe if megablocks_is_available() else None
20 | dmoe = megablocks.layers.dmoe if megablocks_is_available() else None
21 | arguments = megablocks.layers.arguments if megablocks_is_available() else None
22 |
23 |
24 | def as_megablocks_args(neox_args):
25 | import copy
26 |
27 | tmp = copy.copy(neox_args)
28 | args = arguments.from_megatron(tmp)
29 | args.moe_lbl_in_fp32 = True
30 | args.fp16 = neox_args.precision == "fp16"
31 | args.moe_loss_weight = neox_args.moe_loss_coeff
32 | return args
33 |
--------------------------------------------------------------------------------
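A short sketch of how the guarded import above is meant to be consumed: check availability before touching any MegaBlocks symbol, then translate the NeoX arguments. Here `neox_args` stands in for a hypothetical `NeoXArgs` instance, and the actual MoE layer construction is elided:

```python
# Sketch only: gate all MegaBlocks usage on the optional dependency being installed.
from megatron.model import megablocks_utils


def maybe_build_megablocks_args(neox_args):
    """Return MegaBlocks arguments for `neox_args`, or None if MegaBlocks is absent."""
    if not megablocks_utils.megablocks_is_available():
        return None  # caller should fall back to a non-MoE code path
    return megablocks_utils.as_megablocks_args(neox_args)
```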
/tools/datasets/dataset_token_count.py:
--------------------------------------------------------------------------------
1 | # Script counts tokens in a pretokenized dataset from preprocess_data.py
2 | # Necessary for setting batch size, train_iters, etc
3 |
4 | import sys
5 | import os
6 |
7 | ## Necessary for the import
8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
9 | sys.path.insert(0, project_root)
10 |
11 | from megatron.data import indexed_dataset
12 | import numpy as np
13 |
14 | if len(sys.argv) < 2:
15 | print(
16 | "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..."
17 | )
18 | sys.exit(1)
19 |
20 | # Access the command-line arguments
21 | arguments = sys.argv[1:]
22 |
23 | for arg in arguments:
24 | print("Checking file", arg)
25 | try:
26 | dataset = indexed_dataset.make_dataset(arg, "mmap")
27 | size = np.sum(dataset.sizes)
28 | print("Dataset size in tokens is", size)
29 | except AttributeError:
30 | print("Dataset could not be loaded", arg)
31 |
--------------------------------------------------------------------------------
/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | /*This code is copied from NVIDIA apex:
18 | * https://github.com/NVIDIA/apex
19 | * with minor changes. */
20 |
21 | #ifndef TORCH_CHECK
22 | #define TORCH_CHECK AT_CHECK
23 | #endif
24 |
25 | #ifdef VERSION_GE_1_3
26 | #define DATA_PTR data_ptr
27 | #else
28 | #define DATA_PTR data
29 | #endif
30 |
--------------------------------------------------------------------------------
/tools/bash/syncdir.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #!/usr/bin/env bash
16 |
17 | # Push files to all nodes
18 | # Usage
19 | # syncdir.sh file [file2..]
20 |
21 | echo Number of files to upload: $#
22 |
23 | for file in "$@"
24 | do
25 | full_path=$(realpath $file)
26 | parentdir="$(dirname "$full_path")"
27 | echo Uploading $full_path to $parentdir
28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir
29 | done
30 |
--------------------------------------------------------------------------------
/megatron/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 |
16 |
17 | def print_rank_0(*message):
18 | """If distributed is initialized print only on rank 0."""
19 | if torch.distributed.is_initialized():
20 | if torch.distributed.get_rank() == 0:
21 | print(*message, flush=True)
22 | else:
23 | print(*message, flush=True)
24 |
25 |
26 | from .neox_arguments import NeoXArgs
27 |
--------------------------------------------------------------------------------
/megatron/model/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2024 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from .gpt2_model import GPT2ModelPipe
19 | from .utils import (
20 | get_params_for_weight_decay_optimization,
21 | mark_norms_for_sequence_parallel_grad_sync,
22 | )
23 | from .word_embeddings import SoftEmbedding
24 |
--------------------------------------------------------------------------------
/tests/unit/test_format_conversion_scripts.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from tools.ckpts import convert_neox_to_hf
3 | from tests.common import simulate_deepy_env, save_random_model
4 | from megatron.neox_arguments.neox_args import NeoXArgsTokenizer
5 |
6 |
7 | def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path):
8 | # Generate random GPT-NEOX model, check we can convert to hf format
9 | model_dir = str(tmpdir)
10 | input_args = ["train.py", "tests/config/test_setup.yml"]
11 | deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args)
12 | save_random_model(deepspeed_main_args, model_dir, train_iters=1)
13 |
14 | # Generate output
15 | script_args = [
16 | "--config_file",
17 | "tests/config/test_setup.yml",
18 | "--input_dir",
19 | model_dir + "/global_step1",
20 | "--output_dir",
21 | model_dir,
22 | ]
23 | overwrite_values = {"tokenizer_type": NeoXArgsTokenizer.tokenizer_type}
24 | convert_neox_to_hf.main(input_args=script_args, overwrite_values=overwrite_values)
25 |
--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_implementation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | check implementation of NeoXArgs for duplication errors (would overwrite)
17 | """
18 | import pytest
19 |
20 |
21 | @pytest.mark.cpu
22 | def test_neoxargs_duplicates():
23 | """
24 | tests that there are no duplicates among parent classes of NeoXArgs
25 | """
26 | from megatron import NeoXArgs
27 |
28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates"
29 |
--------------------------------------------------------------------------------
/configs/local_setup.yml:
--------------------------------------------------------------------------------
1 | # Suggested data paths when using GPT-NeoX locally
2 | {
3 | "data_path": "data/enwik8/enwik8_text_document",
4 |
5 | # or for weighted datasets:
6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
9 | # "train-data-weights": [1., 2.],
10 | # "test-data-weights": [2., 1.],
11 | # "valid-data-weights": [0.5, 0.4],
12 |
13 | # If weight_by_num_documents is True, dataset weights are built from a multinomial distribution over groups of data, according to the number of documents in each group.
14 | # WARNING: setting this to True will override any user provided weights
15 | # "weight_by_num_documents": false,
16 | # "weighted_sampler_alpha": 0.3,
17 |
18 | "vocab_file": "data/gpt2-vocab.json",
19 | "merge_file": "data/gpt2-merges.txt",
20 |
21 | "save": "checkpoints",
22 | "load": "checkpoints",
23 | "checkpoint_validation_with_forward_pass": False,
24 |
25 | "tensorboard_dir": "tensorboard",
26 | "log_dir": "logs",
27 | }
28 |
--------------------------------------------------------------------------------
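The `weight_by_num_documents` / `weighted_sampler_alpha` comment above refers to alpha-smoothed multinomial weighting over dataset groups. The actual implementation lives in the data pipeline; the sketch below only illustrates the general scheme, assuming weights proportional to `count ** alpha`:

```python
# Illustration only: alpha-smoothed weighting by document count (alpha < 1 damps
# the dominance of very large groups). Not the repo's exact code.
def weights_by_num_documents(doc_counts, alpha=0.3):
    unnormalized = [count**alpha for count in doc_counts]
    total = sum(unnormalized)
    return [w / total for w in unnormalized]


print(weights_by_num_documents([10_000, 1_000, 100]))  # roughly [0.57, 0.29, 0.14]
```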
/configs/eleutherai_cluster.yml:
--------------------------------------------------------------------------------
1 | # Data paths and options when using EleutherAI cluster
2 | {
3 | # you may include multiple distinct datasets if desired
4 | "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"],
5 | "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"],
6 | "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"],
7 |
8 | # if using multiple datasets, provide weights for them to be sampled with
9 | # "train-data-weights": [1., 2.],
10 | # "test-data-weights": [2., 1.],
11 | # "valid-data-weights": [0.5, 0.4],
12 |
13 |
14 | # If you would like the code to create val and test datasets from your training set use the following instead
15 | # "split" determines the relative size of train, val, and test
16 |
17 | # "split" 995,4,1
18 | # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",
19 |
20 | "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json",
21 | "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt",
22 | "save": "/mnt/ssd-1/checkpoints",
23 | "load": "/mnt/ssd-1/checkpoints",
24 | "tensorboard_dir": "/mnt/ssd-1/tensorboard",
25 | "log_dir": "/mnt/ssd-1/logs",
26 | "wandb_team": "eleutherai",
27 | "wandb_project": "neox",
28 | "wandb_group": "example"
29 | }
30 |
--------------------------------------------------------------------------------
/configs/local_setup_wandb.yml:
--------------------------------------------------------------------------------
1 | # Suggested data paths when using GPT-NeoX locally
2 | {
3 | "data_path": "data/enwik8/enwik8_text_document",
4 |
5 | # or for weighted datasets:
6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
9 | # "train-data-weights": [1., 2.],
10 | # "test-data-weights": [2., 1.],
11 | # "valid-data-weights": [0.5, 0.4],
12 |
13 | # If weight_by_num_documents is True, dataset weights are built from a multinomial distribution over groups of data, according to the number of documents in each group.
14 | # WARNING: setting this to True will override any user provided weights
15 | # "weight_by_num_documents": false,
16 | # "weighted_sampler_alpha": 0.3,
17 |
18 | "vocab_file": "data/gpt2-vocab.json",
19 | "merge_file": "data/gpt2-merges.txt",
20 |
21 | "save": "checkpoints",
22 | "load": "checkpoints",
23 | "checkpoint_validation_with_forward_pass": False,
24 |
25 | "tensorboard_dir": "tensorboard",
26 | "log_dir": "logs",
27 | "use_wandb": True,
28 | "wandb_host": "https://api.wandb.ai",
29 | "wandb_project": "neox"
30 | }
31 |
--------------------------------------------------------------------------------
/megatron/devutil.py:
--------------------------------------------------------------------------------
1 | import torch.cuda
2 |
3 |
4 | class Metric:
5 | """
6 | Dumb utility to collect and report average wall-time metrics.
7 | """
8 |
9 | def __init__(self, label):
10 | self.label = label
11 | self.measurements = []
12 |
13 | def collect(self, measurement):
14 | self.measurements.append(measurement)
15 |
16 | def get_measurements(self):
17 | return self.measurements[:]
18 |
19 | def report(self):
20 | print(
21 | self.label,
22 | torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0),
23 | )
24 |
25 |
26 | def monitor_method_cuda_wall_times(metric, obj, methodname):
27 | """
28 | Measure timings for a method on an object or class.
29 |
30 | For instance:
31 |
32 | >>> metric = Metric('!LNORM')
33 | >>> monitor_method_cuda_wall_times(metric, LayerNorm, 'forward')
34 | """
35 | oldmeth = getattr(obj, methodname)
36 |
37 | start_event = torch.cuda.Event(enable_timing=True)
38 | end_event = torch.cuda.Event(enable_timing=True)
39 |
40 | def newmeth(*args, **kw):
41 | start_event.record()
42 | try:
43 | return oldmeth(*args, **kw)
44 | finally:
45 | end_event.record()
46 | torch.cuda.synchronize()
47 | elapsed = start_event.elapsed_time(end_event)
48 | metric.collect(elapsed)
49 | metric.report()
50 |
51 | setattr(obj, methodname, newmeth)
52 |
--------------------------------------------------------------------------------
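A minimal usage sketch for the utility above, patching `torch.nn.LayerNorm.forward` to collect CUDA wall times. It assumes a CUDA device is available; the module and tensor shapes are arbitrary:

```python
# Sketch only: each forward call records a timing and prints running quantiles
# via Metric.report(). Requires a GPU.
import torch

from megatron.devutil import Metric, monitor_method_cuda_wall_times

metric = Metric("!LNORM")
monitor_method_cuda_wall_times(metric, torch.nn.LayerNorm, "forward")

layer = torch.nn.LayerNorm(64).cuda()
for _ in range(5):
    layer(torch.randn(8, 64, device="cuda"))
```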
/configs/mistral/7B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "make_vocab_size_divisible_by": 1,
5 |
6 | # model settings
7 | "num_layers": 32,
8 | "hidden_size": 4096,
9 | "intermediate_size": 14336,
10 | "num_attention_heads": 32,
11 | "num_kv_heads": 8,
12 | # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen
13 | # and instruction tuned to 16384 seqlen, all with 4096 sliding window
14 | "seq_length": 8192,
15 | "sliding_window_width": 4096,
16 | "max_position_embeddings": 131072,
17 | "pos_emb": "rotary",
18 | "rotary_pct": 1,
19 | "rotary_emb_base": 10000,
20 | "no_weight_tying": true,
21 | "gpt_j_residual": false,
22 | "output_layer_parallelism": "column",
23 | "norm": "rmsnorm",
24 | "rms_norm_epsilon": 1.0e-5,
25 |
26 | # Grouped Query Attention is supported for both default ("global")
27 | # and Flash attention. However, we highly recommend the use of Flash attention
28 | # to get FLOP + runtime speedups when using GQA,
29 | # and sliding window attention is currently only supported by Flash attention.
30 | "attention_config": [[["flash"], 32]],
31 |
32 | "scaled_upper_triang_masked_softmax_fusion": true,
33 | "bias_gelu_fusion": false,
34 | "use_bias_in_norms": false,
35 | "use_bias_in_attn_linear": false,
36 | "activation": "swiglu",
37 |
38 | "tokenizer_type": "SPMTokenizer",
39 | #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/deepy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) 2024, EleutherAI
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import logging
17 | import os
18 |
19 | import deepspeed.launcher.runner
20 |
21 |
22 | def main(input_args=None):
23 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
24 |
25 | from megatron.neox_arguments import NeoXArgs
26 | from megatron.utils import get_wandb_api_key
27 |
28 | neox_args = NeoXArgs.consume_deepy_args(input_args)
29 | deepspeed_main_args = neox_args.get_deepspeed_main_args()
30 |
31 | # Extract wandb API key and inject into worker environments
32 | wandb_token = get_wandb_api_key(neox_args=neox_args)
33 | if wandb_token is not None:
34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
35 | os.environ["WANDB_API_KEY"] = wandb_token
36 |
37 | deepspeed.launcher.runner.main(deepspeed_main_args)
38 |
39 |
40 | if __name__ == "__main__":
41 | main()
42 |
--------------------------------------------------------------------------------
/configs/local_setup_comet.yml:
--------------------------------------------------------------------------------
1 | # Suggested data paths when using GPT-NeoX locally
2 | {
3 | "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document",
4 |
5 | # or for weighted datasets:
6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
9 | # "train-data-weights": [1., 2.],
10 | # "test-data-weights": [2., 1.],
11 | # "valid-data-weights": [0.5, 0.4],
12 |
13 |   # If weight_by_num_documents is True, dataset weights are built from a multinomial distribution over the groups of data, according to the number of documents in each group.
14 | # WARNING: setting this to True will override any user provided weights
15 | # "weight_by_num_documents": false,
16 | # "weighted_sampler_alpha": 0.3,
17 |
18 | "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json",
19 | "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt",
20 |
21 | "save": "checkpoints",
22 | "load": "checkpoints",
23 | "checkpoint_validation_with_forward_pass": False,
24 |
25 | "tensorboard_dir": "tensorboard",
26 | "log_dir": "logs",
27 | "use_comet": True,
28 | # "comet_workspace": "test_workspace", # CHANGE ME
29 | "comet_project": "test_project",
30 | "comet_experiment_name": "test_experiment",
31 | "comet_tags": ["test_tag1", "test_tag2"],
32 |   "comet_others": {"test_others": "test_others"},
33 | }
34 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Train"""
19 | from megatron.neox_arguments import NeoXArgs
20 | from megatron.training import pretrain
21 |
22 |
23 | def main(input_args=None, overwrite_values=None):
24 | neox_args = NeoXArgs.consume_neox_args(
25 | input_args=input_args, overwrite_values=overwrite_values
26 | )
27 | neox_args.configure_distributed_args()
28 |     neox_args.build_tokenizer()  # tokenizer needs to be built in training in order to set the padding vocab
29 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined
30 |     neox_args.initialize_comet()  # is initialized if comet logging is enabled in the config
31 | pretrain(neox_args=neox_args)
32 |
33 |
34 | if __name__ == "__main__":
35 | main()
36 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.1.0
4 | hooks:
5 | - id: check-case-conflict
6 | - id: check-json
7 | - id: check-symlinks
8 | - id: check-yaml
9 | - id: destroyed-symlinks
10 | - id: end-of-file-fixer
11 | exclude: ^(docs/CNAME/|configs/neox_arguments.md)
12 | - id: fix-byte-order-marker
13 | - id: fix-encoding-pragma
14 | args: [--remove]
15 | - id: mixed-line-ending
16 | args: [--fix=lf]
17 | - id: requirements-txt-fixer
18 | - id: trailing-whitespace
19 | exclude: ^(docs/CNAME/|configs/neox_arguments.md)
20 | - repo: https://gitlab.com/daverona/pre-commit/cpp
21 | rev: 0.8.0
22 | hooks:
23 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
24 | args: []
25 |
26 | - repo: https://github.com/psf/black
27 | rev: 22.3.0
28 | hooks:
29 | - id: black
30 | language_version: python3
31 | - repo: https://github.com/codespell-project/codespell
32 | rev: v2.1.0
33 | hooks:
34 | - id: codespell
35 | args: [
36 |           '--ignore-words-list=reord,dout,te', # Words used in error messages that need rewording; te --> transformerengine
37 | --check-filenames,
38 | --check-hidden,
39 | ]
40 | exclude: tests/data/hf_cache/tokenizer/gpt2.json
41 |
--------------------------------------------------------------------------------
/megatron/mpu/random.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # We have mostly moved to DeepSpeed's checkpointing (the code is identical anyway), so this file currently only contains imports and aliases.
16 | # TODO: should be able to get rid of this file entirely
17 |
18 | import deepspeed
19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing
20 |
21 | # Default name for the model parallel rng tracker.
22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = (
23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME
24 | )
25 |
26 | # Whether to apply model parallelism to checkpointed hidden states.
27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
28 |
29 | # RNG tracker object.
30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER
31 |
32 | # Deepspeed checkpointing functions
33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones
34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state
35 | checkpoint = checkpointing.checkpoint
36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed
37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker
38 |
--------------------------------------------------------------------------------
/tools/ckpts/upload.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import sys
17 |
18 | from huggingface_hub import HfApi, create_repo
19 |
20 | converted_ckpt = sys.argv[1]
21 | repo_name = sys.argv[2]
22 | branch_name = sys.argv[3]
23 | try:
24 | create_repo(repo_name, repo_type="model", private=False)
25 | except Exception:
26 |     print(f"repo {repo_name} already exists!")
27 | pass
28 |
29 | files = os.listdir(converted_ckpt)
30 |
31 | api = HfApi()
32 | if branch_name != "main":
33 | try:
34 | api.create_branch(
35 | repo_id=repo_name,
36 | repo_type="model",
37 | branch=branch_name,
38 | )
39 |     except Exception:
40 | print(f"branch {branch_name} already exists, try again...")
41 | print(f"to upload: {files}")
42 | for file in files:
43 | print(f"Uploading {file} to branch {branch_name}...")
44 | api.upload_file(
45 | path_or_fileobj=os.path.join(converted_ckpt, file),
46 | path_in_repo=file,
47 | repo_id=repo_name,
48 | repo_type="model",
49 | commit_message=f"Upload {file}",
50 | revision=branch_name,
51 | )
52 | print(f"Successfully uploaded {file} !")
53 |
--------------------------------------------------------------------------------
/README-MUP.md:
--------------------------------------------------------------------------------
1 | # How to use Mup (https://github.com/microsoft/mup)
2 |
3 | ## Add mup neox args to your config
4 |
5 | ```
6 | # mup
7 |
8 | "use-mup": true,
9 |
10 | "save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank
11 |
12 | "base-shapes-file": "base-shapes", # load base shapes from this file
13 |
14 | "coord-check": false, # generate coord check plots to verify mup's implementation in neox
15 |
16 | # mup hp search
17 |
18 | "mup-init-scale": 1.0,
19 |
20 | "mup-attn-temp": 1.0,
21 |
22 | "mup-output-temp": 1.0,
23 |
24 | "mup-embedding-mult": 1.0,
25 |
26 | "mup-rp-embedding-mult": 1.0,
27 | ```
28 |
29 | ## Generate base shapes
30 |
31 | 1. Set use-mup to true
32 | 2. Set save-base-shapes to true
33 | 3. Run once (see the example command below). gpt-neox will instantiate a base model and a delta model, save one base-shapes file per rank (the filename is taken from `base-shapes-file`), and then exit immediately.
34 | 4. Set save-base-shapes to false
35 |
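For concreteness, here is a minimal sketch of the launch, assuming the standard `deepy.py` launcher and a hypothetical config file `configs/your_config.yml` that already contains the mup block above:

```
# one-off run with "use-mup": true and "save-base-shapes": true in the config
python ./deepy.py train.py configs/your_config.yml
# gpt-neox writes the base-shapes files and exits; set "save-base-shapes" back to false afterwards
```

The coord check run described in the next section is launched the same way, with "coord-check" set to true instead.
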
36 | ## Generate coord check plots (optional)
37 |
38 | 1. Keep use-mup true
39 | 2. Set coord-check to true
40 | 3. Run once. gpt-neox will output jpg images similar to those at https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will then exit immediately.
41 | 4. Set coord-check to false
42 |
43 | ## Tune mup hyperparameters and LR
44 |
45 | The values under `mup hp search` were added to correspond to appendix F.4 of https://arxiv.org/pdf/2203.03466.pdf. These, together with the LR, are tuned with a random search using the scaled-up config (tested with 6-7B.yml), but with hidden-size set to the value from the scaled-down config (125M.yml). A sketch of one such trial is given below.
46 |
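A rough sketch of a single search trial (the override file name is hypothetical; the launcher merges every YAML passed to it, so a small override file can layer the sampled values on top of the scaled-up config):

```
# 6-7B.yml plus an override file holding the sampled LR, the mup-* multipliers,
# and hidden-size taken from the scaled-down 125M.yml; for the final Transfer run,
# drop the hidden-size override and relaunch with the best values found
python ./deepy.py train.py configs/6-7B.yml mup_search_overrides.yml
```
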
47 | ## Transfer
48 |
49 | With the best LR and the best mup HPs in place, revert hidden-size in the scaled-up config to its original value and run again.
50 |
--------------------------------------------------------------------------------
/tests/unit/test_arguments.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from megatron.neox_arguments import NeoXArgs
16 | from tests.common import BASE_CONFIG, DistributedTest
17 |
18 |
19 | def test_main_constructor():
20 | input_args = ["train.py", "tests/config/test_setup.yml"]
21 | neox_args = NeoXArgs.consume_deepy_args(input_args)
22 | deepspeed_main_args = neox_args.get_deepspeed_main_args()
23 | neox_args = NeoXArgs.consume_neox_args(input_args=deepspeed_main_args)
24 | neox_args.configure_distributed_args()
25 |
26 |
27 | class test_constructor_from_ymls_class(DistributedTest):
28 | world_size = 2
29 |
30 | def test(self):
31 | neox_args = NeoXArgs.from_ymls(["tests/config/test_setup.yml"])
32 | neox_args.configure_distributed_args()
33 |
34 |
35 | def test_constructor_from_ymls():
36 | t1 = test_constructor_from_ymls_class()
37 | t1.test()
38 |
39 |
40 | class test_constructor_from_dict_class(DistributedTest):
41 | world_size = 2
42 |
43 | def test(self):
44 | neox_args = NeoXArgs.from_dict(BASE_CONFIG)
45 |
46 |
47 | def test_constructor_from_dict():
48 | t1 = test_constructor_from_dict_class()
49 | t1.test()
50 |
--------------------------------------------------------------------------------
/post-training/llama_data.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_dataset, DatasetDict
4 |
5 | import jsonlines
6 |
7 | ###############
8 | # Load datasets
9 | ###############
10 | raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized")
11 | # convert to just train and test, not necessary but it looks better
12 | raw_datasets = DatasetDict(
13 | {
14 | "train": raw_datasets["train_prefs"],
15 | "test": raw_datasets["test_prefs"],
16 | }
17 | )
18 | os.makedirs(os.path.join("data", "pairwise"), exist_ok=True)
19 | for split in ["train", "test"]:
20 | with open(
21 | os.path.join("data", "pairwise", f"llama3_dpo_{split}_filtered.jsonl"), "w"
22 | ) as f:
23 | writer = jsonlines.Writer(f)
24 | for item in raw_datasets[split]:
25 |             # "chosen" and "rejected" are already in the desired pairwise format,
26 |             # so each record is written through unchanged
27 | writer.write(item)
28 | os.makedirs(os.path.join("data", "sft"), exist_ok=True)
29 | for split in ["train", "test"]:
30 | with open(
31 | os.path.join("data", "sft", f"llama3_sft_{split}_filtered.jsonl"), "w"
32 | ) as f:
33 | writer = jsonlines.Writer(f)
34 | for item in raw_datasets[split]:
35 | item["messages"] = item["chosen"]
36 | writer.write(item)
37 | os.makedirs(os.path.join("data", "kto"), exist_ok=True)
38 | for split in ["train", "test"]:
39 | with open(
40 | os.path.join("data", "kto", f"llama3_kto_{split}_filtered.jsonl"), "w"
41 | ) as f:
42 | writer = jsonlines.Writer(f)
43 | for item in raw_datasets[split]:
44 | item["messages"] = item["chosen"]
45 | item["reward"] = 1
46 | writer.write(item)
47 | item["messages"] = item["rejected"]
48 | item["reward"] = -1
49 | writer.write(item)
50 |
--------------------------------------------------------------------------------
/megatron/neox_arguments/template.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from dataclasses import dataclass
16 | import logging
17 |
18 |
19 | @dataclass
20 | class NeoXArgsTemplate:
21 | def defaults(self):
22 | """
23 | generator for getting default values.
24 | """
25 | for key, field_def in self.__dataclass_fields__.items():
26 | yield key, field_def.default
27 |
28 | def update_value(self, key: str, value):
29 | """
30 | updates a property value if the key already exists
31 |
32 | Problem: a previously non-existing property can be added to the class instance without error.
33 | """
34 | if hasattr(self, key):
35 | setattr(self, key, value)
36 | else:
37 | error_message = (
38 | self.__class__.__name__
39 | + ".update_value() to be updated property "
40 | + str(key)
41 | + " does not exist"
42 | )
43 | logging.error(error_message)
44 | raise ValueError(error_message)
45 |
46 | def update_values(self, d):
47 | """
48 |         Updates multiple values in self if the keys already exist
49 | """
50 | for k, v in d.items():
51 | self.update_value(k, v)
52 |
--------------------------------------------------------------------------------
/configs/slurm_125M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 | "num_layers": 12,
5 | "hidden_size": 768,
6 | "num_attention_heads": 12,
7 | "seq_length": 2048,
8 | "max_position_embeddings": 2048,
9 | "norm": "layernorm",
10 | "pos_emb": "rotary",
11 | "no_weight_tying": true,
12 | "scaled_upper_triang_masked_softmax_fusion": true,
13 | "bias_gelu_fusion": true,
14 | "rope_fusion": false,
15 | "layernorm_fusion": false,
16 | "optimizer": {
17 | "type": "Adam",
18 | "params": {
19 | "lr": 0.0006,
20 | "betas": [0.9, 0.999],
21 | "eps": 1.0e-8
22 | }
23 | },
24 | "zero_optimization": {
25 | "stage": 0,
26 | "allgather_partitions": true,
27 | "allgather_bucket_size": 500000000,
28 | "overlap_comm": true,
29 | "reduce_scatter": true,
30 | "reduce_bucket_size": 500000000,
31 | "contiguous_gradients": true
32 | },
33 | "train_micro_batch_size_per_gpu": 4,
34 | "data_impl": "mmap",
35 | "split": "949,50,1",
36 | "checkpoint_activations": true,
37 | "checkpoint_num_layers": 1,
38 | "partition_activations": true,
39 | "synchronize_each_layer": true,
40 | "gradient_clipping": 1.0,
41 | "weight_decay": 0.0,
42 | "hidden_dropout": 0.0,
43 | "attention_dropout": 0.0,
44 | "fp16": {
45 | "enabled": true,
46 | "loss_scale": 0,
47 | "loss_scale_window": 1000,
48 | "hysteresis": 2,
49 | "min_loss_scale": 1
50 | },
51 | "train_iters": 320000,
52 | "lr_decay_iters": 320000,
53 | "distributed_backend": "nccl",
54 | "lr_decay_style": "cosine",
55 | "warmup": 0.01,
56 | "checkpoint_factor": 10000,
57 | "eval_interval": 1000,
58 | "eval_iters": 10,
59 | "log_interval": 100,
60 | "steps_per_print": 10,
61 | "keep_last_n_checkpoints": 4,
62 | "wall_clock_breakdown": true,
63 | "launcher": "slurm",
64 | "deepspeed_slurm": true,
65 | "comment": "neox"
66 | }
67 |
--------------------------------------------------------------------------------
/configs/llama/train_config.yml:
--------------------------------------------------------------------------------
1 | {
2 | # finetuning option
3 | "finetune": true,
4 |
5 | # init methods
6 | "init_method": "small_init",
7 | "output_layer_init_method": "wang_init",
8 |
9 | # optimizer settings
10 | "optimizer": {
11 | "type": "Adam",
12 | "params": {
13 | "lr": 0.0002,
14 | "betas": [0.9, 0.95],
15 | "eps": 1.0e-8,
16 | }
17 | },
18 | "min_lr": 0.00002,
19 | "override_lr_scheduler": true,
20 |
21 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
22 | "zero_optimization": {
23 | "stage": 1,
24 | "allgather_partitions": True,
25 | "allgather_bucket_size": 500000000,
26 | "overlap_comm": True,
27 | "reduce_scatter": True,
28 | "reduce_bucket_size": 500000000,
29 | "contiguous_gradients": True,
30 | },
31 |
32 | # batch / data settings
33 | "train_micro_batch_size_per_gpu": 4,
34 | "data_impl": "mmap",
35 |
36 | # activation checkpointing
37 | "checkpoint_activations": true,
38 | "checkpoint_num_layers": 1,
39 | "partition_activations": true,
40 | "synchronize_each_layer": true,
41 |
42 | # regularization
43 | "gradient_clipping": 1.0,
44 | "weight_decay": 0.1,
45 | "hidden_dropout": 0,
46 | "attention_dropout": 0,
47 |
48 | # precision settings
49 | "fp16": {
50 | "fp16": true,
51 | "enabled": true,
52 | "loss_scale": 0,
53 | "loss_scale_window": 1000,
54 | "hysteresis": 2,
55 | "min_loss_scale": 1
56 | },
57 |
58 | # misc. training settings
59 | "train_iters": 320000,
60 | "lr_decay_iters": 320000,
61 | "distributed_backend": "nccl",
62 | "lr_decay_style": "cosine",
63 | "warmup": 0.01,
64 | "checkpoint_factor": 10000,
65 | "eval_interval": 1000,
66 | "eval_iters": 10,
67 |
68 | # logging
69 | "log_interval": 100,
70 | "steps_per_print": 10,
71 | "keep_last_n_checkpoints": 4,
72 | "wall_clock_breakdown": true,
73 | "mlp_multiple_of": 256,
74 | }
75 |
--------------------------------------------------------------------------------
/configs/125M-json.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 12,
6 | "hidden_size": 768,
7 | "num_attention_heads": 12,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos_emb": "rotary",
12 | "no_weight_tying": true,
13 | "gpt_j_residual": false,
14 | "output_layer_parallelism": "column",
15 |
16 | "scaled_upper_triang_masked_softmax_fusion": false,
17 | "bias_gelu_fusion": false,
18 | "rope_fusion": false,
19 | "layernorm_fusion": false,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.0006,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.00006,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true
42 | },
43 |
44 | "train_micro_batch_size_per_gpu": 4,
45 | "data_impl": "mmap",
46 |
47 | "checkpoint_activations": true,
48 | "checkpoint_num_layers": 1,
49 | "partition_activations": true,
50 | "synchronize_each_layer": true,
51 |
52 | "gradient_clipping": 1.0,
53 | "weight_decay": 0.1,
54 | "hidden_dropout": 0.0,
55 | "attention_dropout": 0.0,
56 |
57 | "fp16": {
58 | "enabled": true,
59 | "loss_scale": 0,
60 | "loss_scale_window": 1000,
61 | "hysteresis": 2,
62 | "min_loss_scale": 1
63 | },
64 |
65 | "train_iters": 320000,
66 | "lr_decay_iters": 320000,
67 | "distributed_backend": "nccl",
68 | "lr_decay_style": "cosine",
69 | "warmup": 0.01,
70 | "checkpoint_factor": 10000,
71 | "eval_interval": 1000,
72 | "eval_iters": 10,
73 |
74 | "log_interval": 100,
75 | "steps_per_print": 10,
76 | "keep_last_n_checkpoints": 4,
77 | "wall_clock_breakdown": true,
78 |
79 | "hostfile": "/mock_path"
80 | }
81 |
--------------------------------------------------------------------------------
/configs/autotuning_configs/tune_6-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "pipe-parallel-size": 1,
3 | "model-parallel-size": 8,
4 |
5 | "num-layers": 32,
6 | "hidden-size": 4096,
7 | "num-attention-heads": 32,
8 | "seq-length": 2048,
9 | "max-position-embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos-emb": "rotary",
12 | "no-weight-tying": true,
13 |
14 | "scaled-upper-triang-masked-softmax-fusion": false,
15 | "bias-gelu-fusion": false,
16 |
17 |
18 | "optimizer": {
19 | "type": "Adam",
20 | "params": {
21 | "lr": 0.00012,
22 | "betas": [0.9, 0.999],
23 | "eps": 1.0e-8
24 | }
25 | },
26 |
27 | "train_micro_batch_size_per_gpu": 1,
28 | "zero_optimization": {
29 | "stage": [0, 1, 2, 3]
30 | },
31 | "data-impl": "mmap",
32 | "split": "949,50,1",
33 |
34 | "checkpoint-activations": true,
35 | "checkpoint-num-layers": 1,
36 | "partition-activations": true,
37 | "synchronize-each-layer": true,
38 |
39 | "gradient_clipping": 1.0,
40 | "weight-decay": 0,
41 | "hidden-dropout": 0,
42 | "attention-dropout": 0,
43 |
44 | "fp16": {
45 | "fp16": true,
46 | "enabled": true,
47 | "loss_scale": 0,
48 | "loss_scale_window": 1000,
49 | "hysteresis": 2,
50 | "min_loss_scale": 1
51 | },
52 |
53 | "train-iters": 100,
54 | "lr-decay-iters": 320000,
55 | "distributed-backend": "nccl",
56 | "lr-decay-style": "cosine",
57 | "warmup": 0.01,
58 | "checkpoint-factor": 10000,
59 | "eval-interval": 1000,
60 | "eval-iters": 10,
61 | "log-interval": 100,
62 | "steps_per_print": 10,
63 | "keep-last-n-checkpoints": 4,
64 | "wall_clock_breakdown": true,
65 | "launcher": "slurm",
66 | "deepspeed_slurm": true,
67 | "no_ssh_check": true,
68 | "comment": "neox",
69 | "autotuning": {
70 | "enabled": true,
71 | "mp_size": 8,
72 | "arg_mappings": {
73 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
74 |       "gradient_accumulation_steps": "--gradient_accumulation_steps"
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/configs/gmlp_small.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 | "attention_config": [[["gmlp"], "all"]],
8 |
9 |
10 | # model settings
11 | "num_layers": 12,
12 | "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4
13 | "gmlp_attn_dim": 64,
14 | "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention.
15 | "seq_length": 2048,
16 | "max_position_embeddings": 2048,
17 | "norm": "layernorm",
18 | "pos_emb": "none",
19 | "no_weight_tying": true,
20 |
21 | # optimizer settings
22 | "optimizer": {
23 | "type": "Adam",
24 | "params": {
25 | "lr": 0.0006,
26 | "betas": [0.9, 0.999],
27 |        "eps": 1.0e-8,
28 | }
29 | },
30 |
31 | # batch / data settings
32 | "train_micro_batch_size_per_gpu": 4,
33 | "data_impl": "mmap",
34 | "split": "949,50,1",
35 |
36 | # activation checkpointing
37 | "checkpoint_activations": true,
38 | "checkpoint_num_layers": 1,
39 | "partition_activations": false,
40 | "synchronize_each_layer": true,
41 |
42 | # regularization
43 | "gradient_clipping": 1.0,
44 | "weight_decay": 0.1,
45 | "hidden_dropout": 0.0,
46 | "attention_dropout": 0.0,
47 |
48 | # precision settings
49 | "fp16": {
50 | "enabled": true,
51 | "loss_scale": 0,
52 | "loss_scale_window": 1000,
53 | "hysteresis": 2,
54 | "min_loss_scale": 1
55 | },
56 |
57 | # misc. training settings
58 | "train_iters": 320000,
59 | "lr_decay_iters": 320000,
60 | "distributed_backend": "nccl",
61 | "lr_decay_style": "cosine",
62 | "warmup": 0.01,
63 | "checkpoint_factor": 10000,
64 | "eval_interval": 1000,
65 | "eval_iters": 10,
66 |
67 | # logging
68 | "log_interval": 100,
69 | "steps_per_print": 10,
70 | "keep_last_n_checkpoints": 4,
71 | "wall_clock_breakdown": true,
72 | }
73 |
--------------------------------------------------------------------------------
/megatron/model/fused_bias_dropout.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI contributors
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from typing import Optional
19 | from torch import Tensor
20 |
21 | # flags required to enable jit fusion kernels
22 | torch._C._jit_set_profiling_mode(False)
23 | torch._C._jit_set_profiling_executor(False)
24 | torch._C._jit_override_can_fuse_on_cpu(True)
25 | torch._C._jit_override_can_fuse_on_gpu(True)
26 |
27 |
28 | def bias_dropout_add(
29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool
30 | ) -> Tensor:
31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
32 | if residual is not None:
33 | out = residual + out
34 | return out
35 |
36 |
37 | def get_bias_dropout_add(training):
38 | def _bias_dropout_add(x, bias, residual, prob):
39 | return bias_dropout_add(x, bias, residual, prob, training)
40 |
41 | return _bias_dropout_add
42 |
43 |
44 | @torch.jit.script
45 | def bias_dropout_add_fused_train(
46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float
47 | ) -> Tensor:
48 | return bias_dropout_add(x, bias, residual, prob, True)
49 |
50 |
51 | @torch.jit.script
52 | def bias_dropout_add_fused_inference(
53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float
54 | ) -> Tensor:
55 | return bias_dropout_add(x, bias, residual, prob, False)
56 |
--------------------------------------------------------------------------------
/configs/autotuning_configs/tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "pipe-parallel-size": 1,
3 | "model-parallel-size": 1,
4 | "num-layers": 12,
5 | "hidden-size": 768,
6 | "num-attention-heads": 12,
7 | "seq-length": 2048,
8 | "max-position-embeddings": 2048,
9 | "norm": "layernorm",
10 | "pos-emb": "rotary",
11 | "no-weight-tying": true,
12 | "scaled-upper-triang-masked-softmax-fusion": true,
13 | "bias-gelu-fusion": true,
14 | "optimizer": {
15 | "type": "Adam",
16 | "params": {
17 | "lr": 0.0006,
18 | "betas": [0.9, 0.999],
19 | "eps": 1.0e-8
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 0,
24 | "allgather_partitions": true,
25 | "allgather_bucket_size": 500000000,
26 | "overlap_comm": true,
27 | "reduce_scatter": true,
28 | "reduce_bucket_size": 500000000,
29 | "contiguous_gradients": true,
30 | "cpu_offload": false
31 | },
32 | "train_micro_batch_size_per_gpu": 1,
33 | "autotuning_config": {
34 | "enabled": true,
35 | "arg_mappings": {
36 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
37 |       "gradient_accumulation_steps": "--gradient_accumulation_steps"
38 | }
39 | },
40 | "data-impl": "mmap",
41 | "split": "949,50,1",
42 | "checkpoint-activations": true,
43 | "checkpoint-num-layers": 1,
44 | "partition-activations": true,
45 | "synchronize-each-layer": true,
46 | "gradient_clipping": 1.0,
47 | "weight-decay": 0.0,
48 | "hidden-dropout": 0.0,
49 | "attention-dropout": 0.0,
50 | "fp16": {
51 | "enabled": true,
52 | "loss_scale": 0,
53 | "loss_scale_window": 1000,
54 | "hysteresis": 2,
55 | "min_loss_scale": 1
56 | },
57 | "train-iters": 200,
58 | "lr-decay-iters": 320000,
59 | "distributed-backend": "nccl",
60 | "lr-decay-style": "cosine",
61 | "warmup": 0.01,
62 | "save-interval": 10000,
63 | "eval-interval": 1000,
64 | "eval-iters": 10,
65 | "log-interval": 100,
66 | "steps_per_print": 10,
67 | "keep-last-n-checkpoints": 4,
68 | "wall_clock_breakdown": true,
69 | "launcher": "slurm",
70 | "deepspeed_slurm": true,
71 | "comment": "neox"
72 | }
73 |
--------------------------------------------------------------------------------
/configs/pythia/70M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 6,
6 | "hidden_size": 512,
7 | "num_attention_heads": 8,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "attention_config": [[["flash"], 6]],
17 |
18 | "scaled_upper_triang_masked_softmax_fusion": true,
19 | "bias_gelu_fusion": true,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.001,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.0001,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 32,
46 | "data_impl": "mmap",
47 | "num_workers": 1,
48 |
49 | "checkpoint_activations": true,
50 | "checkpoint_num_layers": 1,
51 | "partition_activations": true,
52 | "synchronize_each_layer": true,
53 |
54 | "gradient_clipping": 1.0,
55 | "weight_decay": 0.1,
56 | "hidden_dropout": 0,
57 | "attention_dropout": 0,
58 |
59 | "fp16": {
60 | "fp16": true,
61 | "enabled": true,
62 | "loss_scale": 0,
63 | "loss_scale_window": 1000,
64 | "initial_scale_power": 12,
65 | "hysteresis": 2,
66 | "min_loss_scale": 1
67 | },
68 |
69 | "train_iters": 143000,
70 | "lr_decay_iters": 143000,
71 | "distributed_backend": "nccl",
72 | "lr_decay_style": "cosine",
73 | "warmup": 0.01,
74 | "checkpoint_factor": 1000,
75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
76 | "eval_interval": 100000,
77 | "eval_iters": 10,
78 |
79 | "log_interval": 10,
80 | "steps_per_print": 10,
81 | "wall_clock_breakdown": true,
82 |
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/pythia/160M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 12,
6 | "hidden_size": 768,
7 | "num_attention_heads": 12,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "attention_config": [[["flash"], 12]],
17 |
18 | "scaled_upper_triang_masked_softmax_fusion": true,
19 | "bias_gelu_fusion": true,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.0006,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.00006,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 32,
46 | "data_impl": "mmap",
47 | "num_workers": 1,
48 |
49 | "checkpoint_activations": true,
50 | "checkpoint_num_layers": 1,
51 | "partition_activations": true,
52 | "synchronize_each_layer": true,
53 |
54 | "gradient_clipping": 1.0,
55 | "weight_decay": 0.1,
56 | "hidden_dropout": 0,
57 | "attention_dropout": 0,
58 |
59 | "fp16": {
60 | "fp16": true,
61 | "enabled": true,
62 | "loss_scale": 0,
63 | "loss_scale_window": 1000,
64 | "initial_scale_power": 12,
65 | "hysteresis": 2,
66 | "min_loss_scale": 1
67 | },
68 |
69 | "train_iters": 143000,
70 | "lr_decay_iters": 143000,
71 | "distributed_backend": "nccl",
72 | "lr_decay_style": "cosine",
73 | "warmup": 0.01,
74 | "checkpoint_factor": 1000,
75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
76 | "eval_interval": 143000,
77 | "eval_iters": 10,
78 |
79 | "log_interval": 10,
80 | "steps_per_print": 10,
81 | "wall_clock_breakdown": true,
82 |
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/pythia/1-4B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 24,
6 | "hidden_size": 2048,
7 | "num_attention_heads": 16,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "attention_config": [[["flash"], 24]],
17 |
18 | "scaled_upper_triang_masked_softmax_fusion": true,
19 | "bias_gelu_fusion": true,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.0002,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.00002,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 16,
46 | "data_impl": "mmap",
47 | "num_workers": 1,
48 |
49 | "checkpoint_activations": true,
50 | "checkpoint_num_layers": 1,
51 | "partition_activations": true,
52 | "synchronize_each_layer": true,
53 |
54 | "gradient_clipping": 1.0,
55 | "weight_decay": 0.1,
56 | "hidden_dropout": 0,
57 | "attention_dropout": 0,
58 |
59 | "fp16": {
60 | "fp16": true,
61 | "enabled": true,
62 | "loss_scale": 0,
63 | "loss_scale_window": 1000,
64 | "initial_scale_power": 12,
65 | "hysteresis": 2,
66 | "min_loss_scale": 1
67 | },
68 |
69 | "train_iters": 143000,
70 | "lr_decay_iters": 143000,
71 | "distributed_backend": "nccl",
72 | "lr_decay_style": "cosine",
73 | "warmup": 0.01,
74 | "checkpoint_factor": 1000,
75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
76 | "eval_interval": 143000,
77 | "eval_iters": 10,
78 |
79 |
80 | "log_interval": 10,
81 | "steps_per_print": 10,
82 | "wall_clock_breakdown": true,
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/pythia/410M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 24,
6 | "hidden_size": 1024,
7 | "num_attention_heads": 16,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "attention_config": [[["flash"], 24]],
17 |
18 | "scaled_upper_triang_masked_softmax_fusion": true,
19 | "bias_gelu_fusion": true,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.0003,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.00003,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 32,
46 | "data_impl": "mmap",
47 | "num_workers": 1,
48 |
49 | "checkpoint_activations": true,
50 | "checkpoint_num_layers": 1,
51 | "partition_activations": true,
52 | "synchronize_each_layer": true,
53 |
54 | "gradient_clipping": 1.0,
55 | "weight_decay": 0.1,
56 | "hidden_dropout": 0,
57 | "attention_dropout": 0,
58 |
59 | "fp16": {
60 | "fp16": true,
61 | "enabled": true,
62 | "loss_scale": 0,
63 | "loss_scale_window": 1000,
64 | "initial_scale_power": 12,
65 | "hysteresis": 2,
66 | "min_loss_scale": 1
67 | },
68 |
69 | "train_iters": 143000,
70 | "lr_decay_iters": 143000,
71 | "distributed_backend": "nccl",
72 | "lr_decay_style": "cosine",
73 | "warmup": 0.01,
74 | "checkpoint_factor": 1000,
75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
76 | "eval_interval": 143000,
77 | "eval_iters": 10,
78 |
79 | "log_interval": 10,
80 | "steps_per_print": 10,
81 | "wall_clock_breakdown": true,
82 |
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/pythia/6-9B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 2,
4 |
5 | "num_layers": 32,
6 | "hidden_size": 4096,
7 | "num_attention_heads": 32,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos_emb": "rotary",
12 | "rotary_pct": 0.25,
13 | "no_weight_tying": true,
14 | "gpt_j_residual": true,
15 | "output_layer_parallelism": "column",
16 |
17 | "attention_config": [[["flash"], 32]],
18 |
19 | "scaled_upper_triang_masked_softmax_fusion": true,
20 | "bias_gelu_fusion": true,
21 |
22 |
23 | "optimizer": {
24 | "type": "Adam",
25 | "params": {
26 | "lr": 0.00012,
27 | "betas": [0.9, 0.95],
28 | "eps": 1.0e-8
29 | }
30 | },
31 |
32 | "min_lr": 0.000012,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 1260000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 1260000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 8,
46 | "gradient_accumulation_steps": 2,
47 | "data_impl": "mmap",
48 |
49 | "checkpoint_activations": true,
50 | "checkpoint_num_layers": 1,
51 | "partition_activations": true,
52 | "synchronize_each_layer": true,
53 |
54 | "gradient_clipping": 1.0,
55 | "weight_decay": 0.1,
56 | "hidden_dropout": 0,
57 | "attention_dropout": 0,
58 |
59 | "fp16": {
60 | "fp16": true,
61 | "enabled": true,
62 | "loss_scale": 0,
63 | "loss_scale_window": 1000,
64 | "initial_scale_power": 12,
65 | "hysteresis": 2,
66 | "min_loss_scale": 1
67 | },
68 |
69 | "train_iters": 143000,
70 | "lr_decay_iters": 143000,
71 | "distributed_backend": "nccl",
72 | "lr_decay_style": "cosine",
73 | "warmup": 0.01,
74 | "checkpoint_factor": 1000,
75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
76 | "eval_interval": 143000,
77 | "eval_iters": 10,
78 |
79 | "log_interval": 10,
80 | "steps_per_print": 10,
81 | "wall_clock_breakdown": true,
82 |
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/autotuning_configs/small_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "pipe-parallel-size": 1,
3 | "model-parallel-size": 1,
4 |
5 | "num-layers": 12,
6 | "hidden-size": 768,
7 | "num-attention-heads": 12,
8 | "seq-length": 2048,
9 | "max-position-embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos-emb": "rotary",
12 | "no-weight-tying": true,
13 |
14 | "scaled-upper-triang-masked-softmax-fusion": false,
15 | "bias-gelu-fusion": false,
16 |
17 |
18 | "optimizer": {
19 | "type": "Adam",
20 | "params": {
21 | "lr": 0.0006,
22 | "betas": [0.9, 0.999],
23 | "eps": 1.0e-8
24 | }
25 | },
26 |
27 | "train_micro_batch_size_per_gpu": 1,
28 | "data-impl": "mmap",
29 | "split": "949,50,1",
30 |
31 | "checkpoint-activations": true,
32 | "checkpoint-num-layers": 1,
33 | "partition-activations": true,
34 | "synchronize-each-layer": true,
35 |
36 | "gradient_clipping": 1.0,
37 | "weight-decay": 0.0,
38 | "hidden-dropout": 0.0,
39 | "attention-dropout": 0.0,
40 |
41 | "fp16": {
42 | "enabled": true,
43 | "loss_scale": 0,
44 | "loss_scale_window": 1000,
45 | "hysteresis": 2,
46 | "min_loss_scale": 1
47 | },
48 |
49 | "train-iters": 320000,
50 | "lr-decay-iters": 320000,
51 | "distributed-backend": "nccl",
52 | "lr-decay-style": "cosine",
53 | "warmup": 0.01,
54 | "save-interval": 10000,
55 | "eval-interval": 1000,
56 | "eval-iters": 10,
57 |
58 | "log-interval": 100,
59 | "steps_per_print": 10,
60 | "keep-last-n-checkpoints": 4,
61 | "wall_clock_breakdown": true,
62 | "launcher": "slurm",
63 | "deepspeed_slurm": true,
64 | "comment": "neox",
65 | "autotuning": {
66 | "enabled": true,
67 | "arg_mappings": {
68 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
69 |       "gradient_accumulation_steps": "--gradient_accumulation_steps"
70 | }
71 | },
72 | "zero_optimization": {
73 | "stage": [0, 1, 2, 3]
74 | },
75 | "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
76 | "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
77 | "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"]
78 | }
79 |
--------------------------------------------------------------------------------
/configs/pythia/12B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 4,
4 |
5 | "num_layers": 36,
6 | "hidden_size": 5120,
7 | "num_attention_heads": 40,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos_emb": "rotary",
12 | "rotary_pct": 0.25,
13 | "no_weight_tying": true,
14 | "gpt_j_residual": true,
15 | "output_layer_parallelism": "column",
16 |
17 | "attention_config": [[["flash"], 36]],
18 |
19 | "scaled_upper_triang_masked_softmax_fusion": true,
20 | "bias_gelu_fusion": true,
21 |
22 | "optimizer": {
23 | "type": "Adam",
24 | "params": {
25 | "lr": 0.00012,
26 | "betas": [0.9, 0.95],
27 | "eps": 1.0e-8
28 | }
29 | },
30 | "min_lr": 0.000012,
31 |
32 | "zero_optimization": {
33 | "stage": 1,
34 | "allgather_partitions": true,
35 | "allgather_bucket_size": 1260000000,
36 | "overlap_comm": true,
37 | "reduce_scatter": true,
38 | "reduce_bucket_size": 1260000000,
39 | "contiguous_gradients": true,
40 | "cpu_offload": false
41 | },
42 |
43 | "train_micro_batch_size_per_gpu": 8,
44 | "gradient_accumulation_steps": 2,
45 | "data_impl": "mmap",
46 |
47 | "checkpoint_activations": true,
48 | "checkpoint_num_layers": 1,
49 | "partition_activations": true,
50 | "synchronize_each_layer": true,
51 |
52 | "gradient_clipping": 1.0,
53 | "weight_decay": 0.1,
54 | "hidden_dropout": 0,
55 | "attention_dropout": 0,
56 |
57 | "fp16": {
58 | "fp16": true,
59 | "enabled": true,
60 | "loss_scale": 0,
61 | "loss_scale_window": 1000,
62 | "initial_scale_power": 12,
63 | "hysteresis": 2,
64 | "min_loss_scale": 1
65 | },
66 |
67 | "train_iters": 143000,
68 | "lr_decay_iters": 143000,
69 | "distributed_backend": "nccl",
70 | "lr_decay_style": "cosine",
71 | "warmup": 0.01,
72 | "checkpoint_factor": 1000,
73 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
74 | "eval_interval": 143000,
75 | "eval_iters": 10,
76 |
77 | "log_interval": 10,
78 | "steps_per_print": 10,
79 | "wall_clock_breakdown": true,
80 |
81 | "log_grad_norm": true,
82 |
83 | "tokenizer_type": "HFTokenizer"
84 | }
85 |
--------------------------------------------------------------------------------
/configs/pythia/1B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 16,
6 | "hidden_size": 2048,
7 | "num_attention_heads": 8,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "scaled_upper_triang_masked_softmax_fusion": true,
17 | "bias_gelu_fusion": true,
18 |
19 | "init_method": "small_init",
20 | "output_layer_init_method": "wang_init",
21 |
22 | "optimizer": {
23 | "type": "Adam",
24 | "params": {
25 | "lr": 0.00025,
26 | "betas": [0.9, 0.95],
27 | "eps": 1.0e-8
28 | }
29 | },
30 | "min_lr": 0.000025,
31 |
32 | "zero_optimization": {
33 | "stage": 0,
34 | "allgather_partitions": true,
35 | "allgather_bucket_size": 500000000,
36 | "overlap_comm": true,
37 | "reduce_scatter": true,
38 | "reduce_bucket_size": 500000000,
39 | "contiguous_gradients": true,
40 | "cpu_offload": false
41 | },
42 |
43 | "fp16": {
44 | "enabled": true,
45 | "type": "bfloat16",
46 | "auto_cast": true,
47 | "loss_scale": 0,
48 | "loss_scale_window": 1000,
49 | "initial_scale_power": 12,
50 | "hysteresis": 2,
51 | "min_loss_scale": 1
52 | },
53 |
54 | "fp32_allreduce": true,
55 |
56 | "train_micro_batch_size_per_gpu": 4,
57 | "gradient_accumulation_steps": 4,
58 | "data_impl": "mmap",
59 | "num_workers": 1,
60 |
61 | "checkpoint_activations": true,
62 | "checkpoint_num_layers": 1,
63 | "partition_activations": true,
64 | "synchronize_each_layer": true,
65 |
66 | "gradient_clipping": 1.0,
67 | "weight_decay": 0.1,
68 | "hidden_dropout": 0,
69 | "attention_dropout": 0,
70 |
71 | "train_iters": 143000,
72 | "lr_decay_iters": 143000,
73 | "distributed_backend": "nccl",
74 | "lr_decay_style": "cosine",
75 | "warmup": 0.01,
76 | "checkpoint_factor": 1000,
77 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
78 | "eval_interval": 143000,
79 | "eval_iters": 10,
80 |
81 | "log_interval": 10,
82 | "steps_per_print": 10,
83 | "wall_clock_breakdown": true,
84 |
85 | "tokenizer_type": "HFTokenizer"
86 | }
87 |
--------------------------------------------------------------------------------
/configs/pythia/2-8B.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | "num_layers": 32,
6 | "hidden_size": 2560,
7 | "num_attention_heads": 32,
8 | "seq_length": 2048,
9 | "max_position_embeddings": 2048,
10 | "pos_emb": "rotary",
11 | "rotary_pct": 0.25,
12 | "no_weight_tying": true,
13 | "gpt_j_residual": true,
14 | "output_layer_parallelism": "column",
15 |
16 | "attention_config": [[["flash"], 32]],
17 |
18 | "scaled_upper_triang_masked_softmax_fusion": true,
19 | "bias_gelu_fusion": true,
20 |
21 | "init_method": "small_init",
22 | "output_layer_init_method": "wang_init",
23 |
24 | "optimizer": {
25 | "type": "Adam",
26 | "params": {
27 | "lr": 0.00016,
28 | "betas": [0.9, 0.95],
29 | "eps": 1.0e-8
30 | }
31 | },
32 | "min_lr": 0.000016,
33 |
34 | "zero_optimization": {
35 | "stage": 1,
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 500000000,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 500000000,
41 | "contiguous_gradients": true,
42 | "cpu_offload": false
43 | },
44 |
45 | "train_micro_batch_size_per_gpu": 8,
46 | "gradient_accumulation_steps": 2,
47 | "data_impl": "mmap",
48 | "num_workers": 1,
49 |
50 | "checkpoint_activations": true,
51 | "checkpoint_num_layers": 1,
52 | "partition_activations": true,
53 | "synchronize_each_layer": true,
54 |
55 | "gradient_clipping": 1.0,
56 | "weight_decay": 0.1,
57 | "hidden_dropout": 0,
58 | "attention_dropout": 0,
59 |
60 | "fp16": {
61 | "fp16": true,
62 | "enabled": true,
63 | "loss_scale": 0,
64 | "loss_scale_window": 1000,
65 | "initial_scale_power": 12,
66 | "hysteresis": 2,
67 | "min_loss_scale": 1
68 | },
69 |
70 | "train_iters": 143000,
71 | "lr_decay_iters": 143000,
72 | "distributed_backend": "nccl",
73 | "lr_decay_style": "cosine",
74 | "warmup": 0.01,
75 | "checkpoint_factor": 1000,
76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
77 | "eval_interval": 40000,
78 | "eval_iters": 10,
79 |
80 | "log_grad_norm": true,
81 |
82 | "log_interval": 10,
83 | "steps_per_print": 10,
84 | "wall_clock_breakdown": true,
85 |
86 | "tokenizer_type": "HFTokenizer"
87 | }
88 |
--------------------------------------------------------------------------------
/configs/800M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | # model settings
6 | "num_layers": 16,
7 | "hidden_size": 2048,
8 | "num_attention_heads": 8,
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "pos_emb": "rotary",
12 | "no_weight_tying": true,
13 | "gpt_j_residual": false,
14 | "output_layer_parallelism": "column",
15 |
16 | "scaled_upper_triang_masked_softmax_fusion": false,
17 | "bias_gelu_fusion": false,
18 | "rope_fusion": false,
19 | "layernorm_fusion": false,
20 |
21 | # init methods
22 | "init_method": "small_init",
23 | "output_layer_init_method": "wang_init",
24 |
25 | "optimizer": {
26 | "type": "Adam",
27 | "params": {
28 | "lr": 0.00025,
29 | "betas": [0.9, 0.95],
30 | "eps": 1.0e-8,
31 | }
32 | },
33 | "min_lr": 0.000025,
34 |
35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36 | "zero_optimization": {
37 | "stage": 1,
38 | "allgather_partitions": True,
39 | "allgather_bucket_size": 500000000,
40 | "overlap_comm": True,
41 | "reduce_scatter": True,
42 | "reduce_bucket_size": 500000000,
43 | "contiguous_gradients": True,
44 | },
45 |
46 | "train_micro_batch_size_per_gpu": 16,
47 | "gradient_accumulation_steps": 1,
48 | "data_impl": "mmap",
49 | "num_workers": 1,
50 |
51 | # activation checkpointing
52 | "checkpoint_activations": true,
53 | "checkpoint_num_layers": 1,
54 | "partition_activations": true,
55 | "synchronize_each_layer": true,
56 |
57 | # regularization
58 | "gradient_clipping": 1.0,
59 | "weight_decay": 0.1,
60 | "hidden_dropout": 0,
61 | "attention_dropout": 0,
62 |
63 | # precision settings
64 | "fp16": {
65 | "fp16": true,
66 | "enabled": true,
67 | "loss_scale": 0,
68 | "loss_scale_window": 1000,
69 | "initial_scale_power": 12,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1,
72 | },
73 |
74 | "train_iters": 143000,
75 | "lr_decay_iters": 143000,
76 | "distributed_backend": "nccl",
77 | "lr_decay_style": "cosine",
78 | "warmup": 0.01,
79 | "checkpoint_factor": 1000,
80 | "eval_interval": 40000,
81 | "eval_iters": 10,
82 |
83 | "log_interval": 10,
84 | "steps_per_print": 10,
85 | "wall_clock_breakdown": true,
86 | }
87 |
--------------------------------------------------------------------------------
/tests/config/test_setup.yml:
--------------------------------------------------------------------------------
1 | # 19M parameter model, & local setup with some additional simplifications
2 | {
3 | # Settings to make the test setup as lightweight as possible
4 | "data_path": "data/enwik8/enwik8_text_document",
5 | "vocab_file": "data/gpt2-vocab.json",
6 | "merge_file": "data/gpt2-merges.txt",
7 | "lr_decay_iters": 20,
8 | "train_iters": 20,
9 | "hostfile": "None",
10 | "include": "localhost:1",
11 | "use_wandb": False,
12 |
13 | # Settings copied from 19M parameter config (some modifications above, meaning we can't use configs/19M.yml directly)
14 | "pipe_parallel_size": 1,
15 | "model_parallel_size": 1,
16 |
17 | # model settings
18 | "num_layers": 2,
19 | "hidden_size": 8,
20 | "num_attention_heads": 4,
21 | "seq_length": 1024,
22 | "max_position_embeddings": 1024,
23 | "pos_emb": "rotary",
24 | "no_weight_tying": true,
25 | "gpt_j_residual": false,
26 | "output_layer_parallelism": "column",
27 |
28 | "scaled_upper_triang_masked_softmax_fusion": false,
29 | "bias_gelu_fusion": false,
30 | "rope_fusion": false,
31 | "layernorm_fusion": false,
32 |
33 | # Optimizer
34 | "optimizer": {
35 | "type": "sm3",
36 | "params": {},
37 | },
38 |
39 | # precision
40 | "precision": "fp16",
41 |
42 | # init methods
43 | "init_method": "small_init",
44 | "output_layer_init_method": "wang_init",
45 |
46 | "train_micro_batch_size_per_gpu": 4,
47 | "gradient_accumulation_steps": 1,
48 | "data_impl": "mmap",
49 | "num_workers": 1,
50 |
51 | # activation checkpointing
52 | "checkpoint_activations": true,
53 | "checkpoint_num_layers": 1,
54 | "partition_activations": true,
55 | "synchronize_each_layer": true,
56 |
57 | # regularization
58 | "gradient_clipping": 1.0,
59 | "weight_decay": 0.1,
60 | "hidden_dropout": 0,
61 | "attention_dropout": 0,
62 |
63 | "distributed_backend": "nccl",
64 | "lr_decay_style": "cosine",
65 | "warmup": 0.01,
66 | "checkpoint_factor": 1000,
67 | "eval_interval": 100000,
68 | "eval_iters": 10,
69 |
70 | "log_interval": 10,
71 | "steps_per_print": 10,
72 | "wall_clock_breakdown": true,
73 |
74 | # additional deepspeed args not specified above
75 | "deepspeed_extra_args": {
76 | "comms_logger": {
77 | "enabled": true,
78 | "verbose": true,
79 | "prof_all": true,
80 | "debug": false
81 | },
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/configs/finetuning_configs/6-9B.yml:
--------------------------------------------------------------------------------
1 | {
2 | # finetuning option
3 | "load": "/path/to/checkpoint",
4 | "finetune": true,
5 |
6 | "pipe-parallel-size": 1,
7 | "model-parallel-size": 2,
8 |
9 | "num-layers": 32,
10 | "hidden-size": 4096,
11 | "num-attention-heads": 32,
12 | "seq-length": 2048,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "rotary",
16 | "rotary_pct": 0.25,
17 | "no-weight-tying": true,
18 | "gpt_j_residual": true,
19 | "output_layer_parallelism": "column",
20 |
21 | "attention-config": [[["flash"], 32]],
22 |
23 | "scaled-upper-triang-masked-softmax-fusion": true,
24 | "bias-gelu-fusion": true,
25 |
26 |
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.00012,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8
33 | }
34 | },
35 |
36 | "min_lr": 0.000012,
37 |
38 | "zero_optimization": {
39 | "stage": 1,
40 | "allgather_partitions": true,
41 | "allgather_bucket_size": 1260000000,
42 | "overlap_comm": true,
43 | "reduce_scatter": true,
44 | "reduce_bucket_size": 1260000000,
45 | "contiguous_gradients": true,
46 | "cpu_offload": false,
47 | "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params
48 | },
49 |
50 | "train_micro_batch_size_per_gpu": 8,
51 | "gradient_accumulation_steps": 2,
52 | "data-impl": "mmap",
53 |
54 | "checkpoint-activations": true,
55 | "checkpoint-num-layers": 1,
56 | "partition-activations": true,
57 | "synchronize-each-layer": true,
58 |
59 | "gradient_clipping": 1.0,
60 | "weight-decay": 0.1,
61 | "hidden-dropout": 0,
62 | "attention-dropout": 0,
63 |
64 | "fp16": {
65 | "fp16": true,
66 | "enabled": true,
67 | "loss_scale": 0,
68 | "loss_scale_window": 1000,
69 | "initial_scale_power": 12,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1
72 | },
73 |
74 | "train-iters": 143000,
75 | "lr-decay-iters": 143000,
76 | "distributed-backend": "nccl",
77 | "lr-decay-style": "cosine",
78 | "warmup": 0.01,
79 | "checkpoint-factor": 1000,
80 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
81 | "eval-interval": 143000,
82 | "eval-iters": 10,
83 |
84 | "log-interval": 10,
85 | "steps_per_print": 10,
86 | "wall_clock_breakdown": true,
87 |
88 | "tokenizer_type": "HFTokenizer"
89 | }
90 |
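For this finetuning config, "load" must point at an existing NeoX checkpoint directory, and with "finetune": true the weights are loaded while (typically) the optimizer state and iteration count are not, so training restarts from step 0. A hedged launch sketch, assuming the usual convention of stacking this file with a data/paths config such as configs/local_setup.yml; the checkpoint path itself is a placeholder set inside the yml:

```bash
# Sketch: finetune a 6.9B model from a previously saved or converted checkpoint.
# The "load" value inside configs/finetuning_configs/6-9B.yml should point at the
# checkpoint root directory (placeholder path in the file above).
python deepy.py train.py configs/finetuning_configs/6-9B.yml configs/local_setup.yml
```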
--------------------------------------------------------------------------------
/configs/autotuning_configs/tune_1-3B.json:
--------------------------------------------------------------------------------
1 | {
2 | "pipe-parallel-size": 1,
3 | "model-parallel-size": 1,
4 |
5 | "num-layers": 24,
6 | "hidden-size": 2048,
7 | "num-attention-heads": 16,
8 | "seq-length": 2048,
9 | "max-position-embeddings": 2048,
10 | "norm": "layernorm",
11 | "pos-emb": "rotary",
12 | "no-weight-tying": true,
13 | "gpt_j_residual": false,
14 | "output_layer_parallelism": "column",
15 | "attention_config": [[["flash"], 24]],
16 | "scaled-upper-triang-masked-softmax-fusion": false,
17 | "bias-gelu-fusion": false,
18 |
19 | "init_method": "small_init",
20 | "output_layer_init_method": "wang_init",
21 |
22 | "optimizer": {
23 | "type": "Adam",
24 | "params": {
25 | "lr": 0.0002,
26 | "betas": [0.9, 0.95],
27 | "eps": 1.0e-8
28 | }
29 | },
30 | "min_lr": 0.00002,
31 |
32 | "zero_optimization": {
33 | "stage": 1,
34 | "allgather_partitions": true,
35 | "allgather_bucket_size": 500000000,
36 | "overlap_comm": true,
37 | "reduce_scatter": true,
38 | "reduce_bucket_size": 500000000,
39 | "contiguous_gradients": true
40 | },
41 | "train_micro_batch_size_per_gpu": 1,
42 | "autotuning": {
43 | "enabled": true,
44 | "arg_mappings": {
45 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
46 |       "gradient_accumulation_steps": "--gradient_accumulation_steps"
47 | }
48 | },
49 | "data-impl": "mmap",
50 |
51 | "checkpoint-activations": false,
52 | "checkpoint-num-layers": 1,
53 | "partition-activations": true,
54 | "synchronize-each-layer": true,
55 |
56 | "gradient_clipping": 1.0,
57 | "weight-decay": 0.1,
58 | "hidden-dropout": 0,
59 | "attention-dropout": 0,
60 |
61 | "fp16": {
62 | "fp16": true,
63 | "enabled": true,
64 | "loss_scale": 0,
65 | "loss_scale_window": 1000,
66 | "hysteresis": 2,
67 | "min_loss_scale": 1
68 | },
69 |
70 | "train-iters": 320000,
71 | "lr-decay-iters": 320000,
72 | "distributed-backend": "nccl",
73 | "lr-decay-style": "cosine",
74 | "warmup": 0.01,
75 | "checkpoint-factor": 10000,
76 | "eval-interval": 1000,
77 | "eval-iters": 10,
78 | "launcher": "slurm",
79 | "deepspeed_slurm": true,
80 | "no_ssh_check": true,
81 |
82 | "log-interval": 10,
83 | "steps_per_print": 10,
84 | "keep-last-n-checkpoints": 1,
85 | "wall_clock_breakdown": true
86 | }
87 |
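This config turns on DeepSpeed autotuning ("autotuning": {"enabled": true}), and arg_mappings tells the autotuner which command-line flags correspond to the micro batch size and gradient accumulation steps it explores. A rough launch sketch; the exact autotuner workflow and cluster resources are assumptions and depend on your Slurm setup:

```bash
# Sketch only: with "launcher": "slurm" and "deepspeed_slurm": true this is normally
# run from inside a Slurm allocation or batch script; adapt nodes/GPUs to your cluster.
python ./deepy.py train.py configs/autotuning_configs/tune_1-3B.json
```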
--------------------------------------------------------------------------------
/configs/bf16_125M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 12,
10 | "hidden_size": 768,
11 | "num_attention_heads": 12,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 |
18 |    # these should provide some speedup but take a while to build, set to true if desired
19 | "scaled_upper_triang_masked_softmax_fusion": false,
20 | "bias_gelu_fusion": false,
21 | "rope_fusion": false,
22 | "layernorm_fusion": false,
23 |
24 |
25 | # optimizer settings
26 | "optimizer": {
27 | "type": "Adam",
28 | "params": {
29 | "lr": 0.0006,
30 | "betas": [0.9, 0.999],
31 | "eps": 1.0e-8,
32 | }
33 | },
34 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
35 | "zero_optimization": {
36 | "stage": 0,
37 | "allgather_partitions": True,
38 | "allgather_bucket_size": 500000000,
39 | "overlap_comm": True,
40 | "reduce_scatter": True,
41 | "reduce_bucket_size": 500000000,
42 | "contiguous_gradients": True,
43 | },
44 |
45 | # batch / data settings
46 | "train_micro_batch_size_per_gpu": 4,
47 | "data_impl": "mmap",
48 | "split": "949,50,1",
49 |
50 | # activation checkpointing
51 | "checkpoint_activations": true,
52 | "checkpoint_num_layers": 1,
53 | "partition_activations": true,
54 | "synchronize_each_layer": true,
55 |
56 | # regularization
57 | "gradient_clipping": 1.0,
58 | "weight_decay": 0.0,
59 | "hidden_dropout": 0.0,
60 | "attention_dropout": 0.0,
61 |
62 | "precision": "bfloat16",
63 |
64 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
65 | # misc. training settings
66 | "train_iters": 320000,
67 | "lr_decay_iters": 320000,
68 | "distributed_backend": "nccl",
69 | "lr_decay_style": "cosine",
70 | "warmup": 0.01,
71 | "checkpoint_factor": 10000,
72 | "eval_interval": 1000,
73 | "eval_iters": 10,
74 |
75 | # logging
76 | "log_interval": 100,
77 | "steps_per_print": 10,
78 | "keep_last_n_checkpoints": 4,
79 | "wall_clock_breakdown": true,
80 | }
81 |
--------------------------------------------------------------------------------
/post-training/recreating_zephyr_dpo.md:
--------------------------------------------------------------------------------
1 | # Initial setup
2 |
3 | ```bash
4 | python tools/ckpts/convert_hf_llama_to_neox.py --tp 2 --model HuggingFaceH4/mistral-7b-sft-beta --model_path checkpoints/neox_converted/zephyr-sft_tp2
5 | ```
6 |
7 |
8 | # To generate data
9 | First, set up a new environment. To keep the data identical between runs, the easiest approach is to create a dedicated conda
10 | environment and follow the steps below.
11 | ```bash
12 | conda create -n handbook python=3.10 && conda activate handbook
13 | git clone https://github.com/huggingface/alignment-handbook.git
14 | cd ./alignment-handbook/
15 | python -m pip install .
16 | python -m pip install jsonlines
17 | ```
18 |
19 | ## DPO data
20 | ```bash
21 | # from the gpt-neox repo
22 | conda activate handbook
23 | python post-training/dpo_data.py
24 | conda deactivate
25 | # activate your neox conda environment (or however you normally switch back to your neox environment)
26 | mkdir data
27 | mkdir data/pairwise
28 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last
29 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last
30 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last
31 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last
32 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last
33 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last
34 | ```
35 |
36 | ## Running
37 | ```bash
38 | python deepy.py train.py post-training/configs/benchmarking/mistral-dpo.yml
39 | ```
40 |
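The six preprocess_data_with_chat_template.py calls in the DPO data section above differ only in the input split and the --jsonl-keys value; an equivalent loop (a sketch reusing exactly the same paths and flags, including the original's use of the train file for the validation prefix) is:

```bash
# Equivalent to the six preprocessing commands above: loop over keys and splits.
for key in rejected chosen; do
  for split in train test val; do
    input="post-training/dpo_${split}_filtered.jsonl"
    # the original instructions feed the train file to the val prefix as well
    [ "$split" = "val" ] && input="post-training/dpo_train_filtered.jsonl"
    python tools/datasets/preprocess_data_with_chat_template.py \
      --input "$input" \
      --output-prefix "data/pairwise/dpo_${split}" \
      --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer \
      --jsonl-keys "$key" --only-last
  done
done
```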
--------------------------------------------------------------------------------
/configs/mamba/mamba-130M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for Mamba
3 | "pipe_parallel_size": 0,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 24,
7 | "hidden_size": 768,
8 | "num_attention_heads": 12, # ignored when using mamba
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "output_layer_parallelism": "column",
12 | "norm": "rmsnorm",
13 | "rms_norm_epsilon": 1.0e-5,
14 |
15 | "attention_config": [[["mamba"], 24]],
16 |
17 | "mamba_selective_scan_fusion": true,
18 | "mamba_causal_conv_fusion": true,
19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion
20 | "activation": "silu",
21 |
22 | # init methods
23 | "init_method": "small_init",
24 | "output_layer_init_method": "single_residual_scaled_normal",
25 |
26 |
27 | # optimizer settings
28 | "optimizer": {
29 | "type": "Adam",
30 | "params": {
31 | "lr": 0.0006,
32 | "betas": [0.9, 0.95],
33 | "eps": 1.0e-8,
34 | }
35 | },
36 | "min_lr": 0.00006,
37 |
38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
39 | "zero_optimization": {
40 | "stage": 1,
41 | "allgather_partitions": True,
42 | "allgather_bucket_size": 500000000,
43 | "overlap_comm": True,
44 | "reduce_scatter": True,
45 | "reduce_bucket_size": 500000000,
46 | "contiguous_gradients": True,
47 | },
48 |
49 | # batch / data settings
50 | "train_micro_batch_size_per_gpu": 4,
51 | "data_impl": "mmap",
52 |
53 | # activation checkpointing
54 | "checkpoint_activations": true,
55 | "checkpoint_num_layers": 1,
56 | "partition_activations": true,
57 | "synchronize_each_layer": true,
58 |
59 | # regularization
60 | "gradient_clipping": 1.0,
61 | "weight_decay": 0.1,
62 | "hidden_dropout": 0.0,
63 | "attention_dropout": 0.0,
64 |
65 | # precision settings
66 | "fp16": {
67 | "enabled": true,
68 | "loss_scale": 0,
69 | "loss_scale_window": 1000,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1
72 | },
73 |
74 | # misc. training settings
75 | "train_iters": 320000,
76 | "lr_decay_iters": 320000,
77 | "distributed_backend": "nccl",
78 | "lr_decay_style": "cosine",
79 | "warmup": 0.01,
80 | "checkpoint_factor": 10000,
81 | "eval_interval": 1000,
82 | "eval_iters": 10,
83 |
84 | # logging
85 | "log_interval": 100,
86 | "steps_per_print": 10,
87 | "keep_last_n_checkpoints": 4,
88 | "wall_clock_breakdown": true,
89 | }
90 |
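As the comment at the top of the file notes, pipeline and model parallelism are not yet supported for Mamba, so this config relies on data parallelism only. A hedged launch sketch, assuming the Mamba kernels behind the selective-scan / causal-conv fusion flags are installed (e.g. via the repo's requirements-mamba extras) and that configs/local_setup.yml (or an equivalent) supplies the dataset and vocab paths:

```bash
# Sketch: install the Mamba kernel dependencies, then launch the 130M Mamba config.
pip install -r requirements/requirements-mamba.txt
python deepy.py train.py configs/mamba/mamba-130M.yml configs/local_setup.yml
```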
--------------------------------------------------------------------------------
/configs/mamba/mamba-370M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for Mamba
3 | "pipe_parallel_size": 0,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 48,
7 | "hidden_size": 1024,
8 | "num_attention_heads": 12, # ignored when using mamba
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "output_layer_parallelism": "column",
12 | "norm": "rmsnorm",
13 | "rms_norm_epsilon": 1.0e-5,
14 |
15 | "attention_config": [[["mamba"], 48]],
16 |
17 | "mamba_selective_scan_fusion": true,
18 | "mamba_causal_conv_fusion": true,
19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion
20 | "activation": "silu",
21 |
22 | # init methods
23 | "init_method": "small_init",
24 | "output_layer_init_method": "single_residual_scaled_normal",
25 |
26 | # optimizer settings
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.0003,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8,
33 | }
34 | },
35 | "min_lr": 0.00003,
36 |
37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
38 | "zero_optimization": {
39 | "stage": 1,
40 | "allgather_partitions": True,
41 | "allgather_bucket_size": 500000000,
42 | "overlap_comm": True,
43 | "reduce_scatter": True,
44 | "reduce_bucket_size": 500000000,
45 | "contiguous_gradients": True,
46 | },
47 | # batch / data settings
48 | "train_micro_batch_size_per_gpu": 4,
49 | "data_impl": "mmap",
50 |
51 | # activation checkpointing
52 | "checkpoint_activations": true,
53 | "checkpoint_num_layers": 1,
54 | "partition_activations": true,
55 | "synchronize_each_layer": true,
56 |
57 | # regularization
58 | "gradient_clipping": 1.0,
59 | "weight_decay": 0.1,
60 | "hidden_dropout": 0,
61 | "attention_dropout": 0,
62 |
63 | # precision settings
64 | "fp16": {
65 | "fp16": true,
66 | "enabled": true,
67 | "loss_scale": 0,
68 | "loss_scale_window": 1000,
69 | "hysteresis": 2,
70 | "min_loss_scale": 1
71 | },
72 |
73 | # misc. training settings
74 | "train_iters": 320000,
75 | "lr_decay_iters": 320000,
76 | "distributed_backend": "nccl",
77 | "lr_decay_style": "cosine",
78 | "warmup": 0.01,
79 | "checkpoint_factor": 10000,
80 | "eval_interval": 1000,
81 | "eval_iters": 10,
82 |
83 | # logging
84 | "log_interval": 100,
85 | "steps_per_print": 10,
86 | "keep_last_n_checkpoints": 4,
87 | "wall_clock_breakdown": true,
88 | }
89 |
--------------------------------------------------------------------------------
/tools/datasets/multinode_prepare_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # USAGE:
4 | # This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributing the chunks
5 | # over the processes.
6 | # This bash script takes a single text file as input argument.
7 | # The text file contains a valid filepath in each line, leading to a jsonl-file.
8 | # Furthermore, environment variables for the rank and the world size need to be set.
9 | # These default to the SLURM and OMPI variables in this order of priority, but they can also be set manually
10 | # using the variables $RANK and $WORLD_SIZE, which will override the cluster-specific variables.
11 | # You can also add all arguments of the prepare_data.py script to this script and it will simply pass them through.
12 |
13 | # Parse command-line arguments
14 | text_file="$1"
15 | rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}"
16 | world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}"
17 | num_lines=$(wc -l < "$text_file")
18 | chunk_size=$((num_lines / world_size))
19 | start_line=$((rank * chunk_size + 1))
20 | end_line=$((start_line + chunk_size - 1))
21 |
22 | # Make sure the last chunk includes all remaining lines
23 | if [[ $rank == $((world_size - 1)) ]]; then
24 | end_line=$num_lines
25 | fi
26 |
27 | # Select the chunk of the text file that corresponds to the rank
28 | chunk_file="chunk_${rank}.txt"
29 | sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file"
30 |
31 | # Parse additional flags to be passed to the Python script
32 | shift 1 # Shift past the first argument (the text file)
33 | py_args=""
34 | prefix_arg=""
35 | while [[ $# -gt 0 ]]; do
36 | case "$1" in
37 | --output-prefix=*) prefix_arg="$1"; shift;;
38 | --output-prefix) prefix_arg="$1 $2"; shift 2;;
39 | --*) py_args="$py_args $1 $2"; shift 2;;
40 | *) echo "Unknown argument: $1"; exit 1;;
41 | esac
42 | done
43 |
44 | # Add the rank to the --output-prefix argument if it is set
45 | if [[ -n "$prefix_arg" ]]; then
46 | py_args="$py_args $prefix_arg$rank"
47 | else
48 | # Inject a default --output-prefix argument containing the rank
49 | py_args="$py_args --output-prefix rank${rank}"
50 | fi
51 |
52 |
53 | echo "processing $chunk_file with rank $rank at world size $world_size"
54 | echo "using the following args: $py_args"
55 | # Call the Python script with the list of file paths in the chunk
56 | python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args
57 |
58 | # Clean up
59 | rm "$chunk_file"
60 |
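Concretely, each rank processes num_lines / world_size consecutive lines of the file list, the last rank also picks up the remainder, and the rank is appended to the output prefix so per-rank outputs do not collide. A hedged usage sketch (the file list and output prefix are placeholders; any extra flags are passed straight through to preprocess_data.py):

```bash
# paths.txt contains one jsonl file path per line.
# Under SLURM/OpenMPI the rank and world size are picked up automatically; here they are
# set explicitly for a 4-process run. The script writes to data/mydataset<rank>.
RANK=0 WORLD_SIZE=4 bash tools/datasets/multinode_prepare_data.sh paths.txt \
    --output-prefix data/mydataset
```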
--------------------------------------------------------------------------------
/configs/mamba/mamba-1.4B.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for Mamba
3 | "pipe_parallel_size": 0,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 48,
7 | "hidden_size": 2048,
8 | "num_attention_heads": 12, # ignored when using mamba
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "output_layer_parallelism": "column",
12 | "norm": "rmsnorm",
13 | "rms_norm_epsilon": 1.0e-5,
14 |
15 | "attention_config": [[["mamba"], 48]],
16 |
17 | "mamba_selective_scan_fusion": true,
18 | "mamba_causal_conv_fusion": true,
19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion
20 | "activation": "silu",
21 |
22 | # init methods
23 | "init_method": "small_init",
24 | "output_layer_init_method": "single_residual_scaled_normal",
25 |
26 | # optimizer settings
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.0002,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8,
33 | }
34 | },
35 | "min_lr": 0.00002,
36 |
37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
38 | "zero_optimization": {
39 | "stage": 1,
40 | "allgather_partitions": True,
41 | "allgather_bucket_size": 500000000,
42 | "overlap_comm": True,
43 | "reduce_scatter": True,
44 | "reduce_bucket_size": 500000000,
45 | "contiguous_gradients": True,
46 | },
47 |
48 | # batch / data settings
49 | "train_micro_batch_size_per_gpu": 4,
50 | "data_impl": "mmap",
51 |
52 | # activation checkpointing
53 | "checkpoint_activations": true,
54 | "checkpoint_num_layers": 1,
55 | "partition_activations": true,
56 | "synchronize_each_layer": true,
57 |
58 | # regularization
59 | "gradient_clipping": 1.0,
60 | "weight_decay": 0.1,
61 | "hidden_dropout": 0,
62 | "attention_dropout": 0,
63 |
64 | # precision settings
65 | "fp16": {
66 | "fp16": true,
67 | "enabled": true,
68 | "loss_scale": 0,
69 | "loss_scale_window": 1000,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1
72 | },
73 |
74 | # misc. training settings
75 | "train_iters": 320000,
76 | "lr_decay_iters": 320000,
77 | "distributed_backend": "nccl",
78 | "lr_decay_style": "cosine",
79 | "warmup": 0.01,
80 | "checkpoint_factor": 10000,
81 | "eval_interval": 1000,
82 | "eval_iters": 10,
83 |
84 | # logging
85 | "log_interval": 1,
86 | "steps_per_print": 10,
87 | "keep_last_n_checkpoints": 4,
88 | "wall_clock_breakdown": true,
89 | }
90 |
--------------------------------------------------------------------------------
/configs/mamba/mamba-2.8B.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for Mamba
3 | "pipe_parallel_size": 0,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 64,
7 | "hidden_size": 2560,
8 | "num_attention_heads": 12, # ignored when using mamba
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "output_layer_parallelism": "column",
12 | "norm": "rmsnorm",
13 | "rms_norm_epsilon": 1.0e-5,
14 |
15 | "attention_config": [[["mamba"], 64]],
16 |
17 | "mamba_selective_scan_fusion": true,
18 | "mamba_causal_conv_fusion": true,
19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion
20 | "activation": "silu",
21 |
22 | # init methods
23 | "init_method": "small_init",
24 | "output_layer_init_method": "single_residual_scaled_normal",
25 |
26 | # optimizer settings
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.00016,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8,
33 | }
34 | },
35 | "min_lr": 0.000016,
36 |
37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
38 | "zero_optimization": {
39 | "stage": 1,
40 | "allgather_partitions": True,
41 | "allgather_bucket_size": 500000000,
42 | "overlap_comm": True,
43 | "reduce_scatter": True,
44 | "reduce_bucket_size": 500000000,
45 | "contiguous_gradients": True,
46 | },
47 |
48 | # batch / data settings
49 | "train_micro_batch_size_per_gpu": 4,
50 | "data_impl": "mmap",
51 |
52 | # activation checkpointing
53 | "checkpoint_activations": true,
54 | "checkpoint_num_layers": 1,
55 | "partition_activations": true,
56 | "synchronize_each_layer": true,
57 |
58 | # regularization
59 | "gradient_clipping": 1.0,
60 | "weight_decay": 0.1,
61 | "hidden_dropout": 0,
62 | "attention_dropout": 0,
63 |
64 | # precision settings
65 | "fp16": {
66 | "fp16": true,
67 | "enabled": true,
68 | "loss_scale": 0,
69 | "loss_scale_window": 1000,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1
72 | },
73 |
74 | # misc. training settings
75 | "train_iters": 320000,
76 | "lr_decay_iters": 320000,
77 | "distributed_backend": "nccl",
78 | "lr_decay_style": "cosine",
79 | "warmup": 0.01,
80 | "checkpoint_factor": 10000,
81 | "eval_interval": 1000,
82 | "eval_iters": 10,
83 |
84 | # logging
85 | "log_interval": 100,
86 | "steps_per_print": 10,
87 | "keep_last_n_checkpoints": 4,
88 | "wall_clock_breakdown": true,
89 | }
90 |
--------------------------------------------------------------------------------
/configs/mamba/mamba-790M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for Mamba
3 | "pipe_parallel_size": 0,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 48,
7 | "hidden_size": 1536,
8 | "num_attention_heads": 12, # ignored when using mamba
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "output_layer_parallelism": "column",
12 | "norm": "rmsnorm",
13 | "rms_norm_epsilon": 1.0e-5,
14 |
15 | "attention_config": [[["mamba"], 48]],
16 |
17 | "mamba_selective_scan_fusion": true,
18 | "mamba_causal_conv_fusion": true,
19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion
20 | "activation": "silu",
21 |
22 | # init methods
23 | "init_method": "small_init",
24 | "output_layer_init_method": "single_residual_scaled_normal",
25 |
26 | # optimizer settings
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.00025,
31 | "betas": [0.9, 0.999],
32 | "eps": 1.0e-8,
33 | }
34 | },
35 | "min_lr": 0.000025,
36 |
37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
38 | "zero_optimization": {
39 | "stage": 1,
40 | "allgather_partitions": True,
41 | "allgather_bucket_size": 500000000,
42 | "overlap_comm": True,
43 | "reduce_scatter": True,
44 | "reduce_bucket_size": 500000000,
45 | "contiguous_gradients": True,
46 | },
47 |
48 | # batch / data settings
49 | "train_micro_batch_size_per_gpu": 4,
50 | "data_impl": "mmap",
51 |
52 | # activation checkpointing
53 | "checkpoint_activations": true,
54 | "checkpoint_num_layers": 1,
55 | "partition_activations": true,
56 | "synchronize_each_layer": true,
57 |
58 | # regularization
59 | "gradient_clipping": 1.0,
60 | "weight_decay": 0.1,
61 | "hidden_dropout": 0,
62 | "attention_dropout": 0,
63 |
64 | # precision settings
65 | "fp16": {
66 | "fp16": true,
67 | "enabled": true,
68 | "loss_scale": 0,
69 | "loss_scale_window": 1000,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1
72 | },
73 |
74 | # misc. training settings
75 | "train_iters": 320000,
76 | "lr_decay_iters": 320000,
77 | "distributed_backend": "nccl",
78 | "lr_decay_style": "cosine",
79 | "warmup": 0.01,
80 | "checkpoint_factor": 10000,
81 | "eval_interval": 1000,
82 | "eval_iters": 10,
83 |
84 | # logging
85 | "log_interval": 100,
86 | "steps_per_print": 10,
87 | "keep_last_n_checkpoints": 4,
88 | "wall_clock_breakdown": true,
89 | }
90 |
--------------------------------------------------------------------------------
/configs/49M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # parallelism settings
3 | "pipe_parallel_size": 1,
4 | "model_parallel_size": 1,
5 |
6 | # model settings
7 | "num_layers": 10,
8 | "hidden_size": 640,
9 | "num_attention_heads": 10,
10 | "seq_length": 2048,
11 | "max_position_embeddings": 2048,
12 | "pos_emb": "rotary",
13 | "rotary_pct": 0.25,
14 | "no_weight_tying": true,
15 | "gpt_j_residual": true,
16 | "output_layer_parallelism": "column",
17 |
18 |    # these should provide some speedup but take a while to build, set to true if desired
19 | "scaled_upper_triang_masked_softmax_fusion": false,
20 | "bias_gelu_fusion": false,
21 | "rope_fusion": false,
22 | "layernorm_fusion": false,
23 |
24 | # init methods
25 | "init_method": "small_init",
26 | "output_layer_init_method": "wang_init",
27 |
28 | # optimizer settings
29 | "optimizer": {
30 | "type": "Adam",
31 | "params": {
32 | "lr": 0.0008,
33 | "betas": [0.9, 0.95],
34 | "eps": 1.0e-8,
35 | }
36 | },
37 | "min_lr": 0.00008,
38 |
39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
40 | "zero_optimization": {
41 | "stage": 1,
42 | "allgather_partitions": True,
43 | "allgather_bucket_size": 500000000,
44 | "overlap_comm": True,
45 | "reduce_scatter": True,
46 | "reduce_bucket_size": 500000000,
47 | "contiguous_gradients": True,
48 | },
49 |
50 | # batch / data settings
51 | "train_micro_batch_size_per_gpu": 32,
52 | "gradient_accumulation_steps": 1,
53 | "data_impl": "mmap",
54 | "num_workers": 1,
55 |
56 | # activation checkpointing
57 | "checkpoint_activations": true,
58 | "checkpoint_num_layers": 1,
59 | "partition_activations": true,
60 | "synchronize_each_layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight_decay": 0.1,
65 | "hidden_dropout": 0,
66 | "attention_dropout": 0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "fp16": true,
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "initial_scale_power": 12,
75 | "hysteresis": 2,
76 | "min_loss_scale": 1,
77 | },
78 |
79 | # misc. training settings
80 | "train_iters": 143000,
81 | "lr_decay_iters": 143000,
82 | "distributed_backend": "nccl",
83 | "lr_decay_style": "cosine",
84 | "warmup": 0.01,
85 | "checkpoint_factor": 1000,
86 | "eval_interval": 100000,
87 | "eval_iters": 10,
88 |
89 | # logging
90 | "log_interval": 10,
91 | "steps_per_print": 10,
92 | "wall_clock_breakdown": true,
93 | }
94 |
--------------------------------------------------------------------------------
/configs/bnb_125M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 12,
10 | "hidden_size": 768,
11 | "num_attention_heads": 12,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "use_bnb_optimizer": true,
18 |
19 |    # these should provide some speedup but take a while to build, set to true if desired
20 | "scaled_upper_triang_masked_softmax_fusion": false,
21 | "bias_gelu_fusion": false,
22 | "rope_fusion": false,
23 | "layernorm_fusion": false,
24 |
25 |
26 | # optimizer settings
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.0006,
31 | "betas": [0.9, 0.999],
32 | "eps": 1.0e-8,
33 | }
34 | },
35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36 | "zero_optimization": {
37 | "stage": 0,
38 | "allgather_partitions": True,
39 | "allgather_bucket_size": 500000000,
40 | "overlap_comm": True,
41 | "reduce_scatter": True,
42 | "reduce_bucket_size": 500000000,
43 | "contiguous_gradients": True,
44 | },
45 |
46 | # batch / data settings
47 | "train_micro_batch_size_per_gpu": 4,
48 | "data_impl": "mmap",
49 | "split": "949,50,1",
50 |
51 | # activation checkpointing
52 | "checkpoint_activations": true,
53 | "checkpoint_num_layers": 1,
54 | "partition_activations": true,
55 | "synchronize_each_layer": true,
56 |
57 | # regularization
58 | "gradient_clipping": 1.0,
59 | "weight_decay": 0.0,
60 | "hidden_dropout": 0.0,
61 | "attention_dropout": 0.0,
62 |
63 | # precision settings
64 | "fp16": {
65 | "enabled": true,
66 | "loss_scale": 0,
67 | "loss_scale_window": 1000,
68 | "hysteresis": 2,
69 | "min_loss_scale": 1
70 | },
71 |
72 | # misc. training settings
73 | "train_iters": 320000,
74 | "lr_decay_iters": 320000,
75 | "distributed_backend": "nccl",
76 | "lr_decay_style": "cosine",
77 | "warmup": 0.01,
78 | "checkpoint_factor": 10000,
79 | "eval_interval": 1000,
80 | "eval_iters": 10,
81 |
82 | # logging
83 | "log_interval": 100,
84 | "steps_per_print": 10,
85 | "keep_last_n_checkpoints": 4,
86 | "wall_clock_breakdown": true,
87 | }
88 |
--------------------------------------------------------------------------------
/configs/19M.yml:
--------------------------------------------------------------------------------
1 | {
2 | "pipe_parallel_size": 1,
3 | "model_parallel_size": 1,
4 |
5 | # model settings
6 | "num_layers": 6,
7 | "hidden_size": 512,
8 | "num_attention_heads": 8,
9 | "seq_length": 2048,
10 | "max_position_embeddings": 2048,
11 | "pos_emb": "rotary",
12 | "no_weight_tying": true,
13 | "gpt_j_residual": false,
14 | "output_layer_parallelism": "column",
15 |
16 | "scaled_upper_triang_masked_softmax_fusion": false,
17 | "bias_gelu_fusion": false,
18 | "rope_fusion": false,
19 | "layernorm_fusion": false,
20 |
21 | # init methods
22 | "init_method": "small_init",
23 | "output_layer_init_method": "wang_init",
24 |
25 | "optimizer": {
26 | "type": "Adam",
27 | "params": {
28 | "lr": 0.001,
29 | "betas": [0.9, 0.95],
30 | "eps": 1.0e-8,
31 | }
32 | },
33 | "min_lr": 0.0001,
34 |
35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36 | "zero_optimization": {
37 | "stage": 1,
38 | "allgather_partitions": True,
39 | "allgather_bucket_size": 500000000,
40 | "overlap_comm": True,
41 | "reduce_scatter": True,
42 | "reduce_bucket_size": 500000000,
43 | "contiguous_gradients": True,
44 | },
45 |
46 | "train_micro_batch_size_per_gpu": 4, #32,
47 | "gradient_accumulation_steps": 1,
48 | "data_impl": "mmap",
49 | "num_workers": 1,
50 |
51 | # activation checkpointing
52 | "checkpoint_activations": true,
53 | "checkpoint_num_layers": 1,
54 | "partition_activations": true,
55 | "synchronize_each_layer": true,
56 |
57 | # regularization
58 | "gradient_clipping": 1.0,
59 | "weight_decay": 0.1,
60 | "hidden_dropout": 0,
61 | "attention_dropout": 0,
62 |
63 | # precision settings
64 | "fp16": {
65 | "fp16": true,
66 | "enabled": true,
67 | "loss_scale": 0,
68 | "loss_scale_window": 1000,
69 | "initial_scale_power": 12,
70 | "hysteresis": 2,
71 | "min_loss_scale": 1,
72 | },
73 |
74 | "train_iters": 143000,
75 | "lr_decay_iters": 143000,
76 | "distributed_backend": "nccl",
77 | "lr_decay_style": "cosine",
78 | "warmup": 0.01,
79 | "checkpoint_factor": 1000,
80 | "eval_interval": 100000,
81 | "eval_iters": 10,
82 |
83 | "log_interval": 10,
84 | "steps_per_print": 10,
85 | "wall_clock_breakdown": true,
86 |
87 | # additional deepspeed args not specified above
88 | "deepspeed_extra_args": {
89 | "comms_logger": {
90 | "enabled": true,
91 | "verbose": true,
92 | "prof_all": true,
93 | "debug": false
94 | },
95 | }
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/prepare_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from tools.datasets.corpora import prepare_dataset, DATA_DOWNLOADERS
16 | import argparse
17 |
18 | TOKENIZER_CHOICES = [
19 | "HFGPT2Tokenizer",
20 | "HFTokenizer",
21 | "GPT2BPETokenizer",
22 | "CharLevelTokenizer",
23 | "TiktokenTokenizer",
24 | "SPMTokenizer",
25 | ]
26 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"]
27 |
28 |
29 | def get_args():
30 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets")
31 | parser.add_argument(
32 | "dataset",
33 | nargs="?",
34 | default="enwik8",
35 | help="name of dataset to download.",
36 | choices=DATASET_CHOICES,
37 | )
38 | parser.add_argument(
39 | "-t",
40 | "--tokenizer",
41 | default="GPT2BPETokenizer",
42 | choices=TOKENIZER_CHOICES,
43 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}',
44 | )
45 | parser.add_argument(
46 | "-d",
47 | "--data-dir",
48 | default=None,
49 | help=f"Directory to which to download datasets / tokenizer "
50 | f"files - defaults to ./data",
51 | )
52 | parser.add_argument(
53 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)"
54 | )
55 | parser.add_argument(
56 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)"
57 | )
58 | parser.add_argument(
59 | "-f",
60 | "--force-redownload",
61 | dest="force_redownload",
62 | default=False,
63 | action="store_true",
64 | )
65 | return parser.parse_args()
66 |
67 |
68 | if __name__ == "__main__":
69 | args = get_args()
70 | prepare_dataset(
71 | dataset_name=args.dataset,
72 | tokenizer_type=args.tokenizer,
73 | data_dir=args.data_dir,
74 | vocab_file=args.vocab_file,
75 | merge_file=args.merge_file,
76 | force_redownload=args.force_redownload,
77 | )
78 |
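Since the argparse block above defines the full CLI, a couple of hedged usage examples (the tokenizer file path is a placeholder, and available dataset names depend on DATA_DOWNLOADERS):

```bash
# Download and tokenize the default enwik8 dataset with the GPT-2 BPE tokenizer into ./data
python prepare_data.py

# Same dataset, tokenized with a HuggingFace tokenizer file and an explicit data directory
python prepare_data.py enwik8 -t HFTokenizer -v /path/to/tokenizer.json -d ./data
```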
--------------------------------------------------------------------------------
/tools/datasets/merge_datasets.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import argparse
5 |
6 | sys.path.append(
7 | os.path.abspath(
8 | os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)
9 | )
10 | )
11 |
12 | from megatron.data import indexed_dataset
13 |
14 |
15 | def main(args):
16 |
17 | prefixes = set()
18 | for basename in os.listdir(args.input):
19 | prefix, ext = os.path.splitext(basename)
20 |
21 | if prefix in prefixes:
22 | continue
23 |
24 | if not os.path.isfile(os.path.join(args.input, basename)):
25 | continue
26 |
27 | ext_pair = ".bin" if ext == ".idx" else ".idx"
28 | assert os.path.isfile(
29 | os.path.join(args.input, prefix) + ext_pair
30 | ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}"
31 |
32 | prefixes.add(prefix)
33 |
34 | builder = None
35 | for prefix in sorted(prefixes):
36 | if builder is None:
37 | dataset = indexed_dataset.make_dataset(
38 | os.path.join(args.input, prefix), "infer"
39 | )
40 |
41 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset):
42 | builder = indexed_dataset.MMapIndexedDatasetBuilder(
43 | args.output_prefix + ".bin", dtype=dataset._index.dtype
44 | )
45 | else:
46 | builder = indexed_dataset.IndexedDatasetBuilder(
47 | args.output_prefix + ".bin"
48 | )
49 |
50 | del dataset
51 |
52 | builder.merge_file_(os.path.join(args.input, prefix))
53 |
54 | builder.finalize(args.output_prefix + ".idx")
55 |
56 |
57 | if __name__ == "__main__":
58 | parser = argparse.ArgumentParser()
59 |
60 | group = parser.add_argument_group(title="input data")
61 | group.add_argument(
62 | "--input",
63 | type=str,
64 | required=True,
65 | help="Path to directory containing all document files to merge",
66 | )
67 |
68 | group = parser.add_argument_group(title="output data")
69 | group.add_argument(
70 | "--output-prefix",
71 | type=str,
72 | required=True,
73 | help="Path to binary output file without suffix",
74 | )
75 |
76 | args = parser.parse_args()
77 |
78 | assert os.path.isdir(
79 | args.input
80 | ), f"ERROR: {args.input} is not a directory or does not exist"
81 |
82 | assert os.path.isdir(
83 | os.path.dirname(args.output_prefix)
84 | ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist"
85 |
86 | main(args)
87 |
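A hedged usage sketch for the merge script above: it expects a directory of matching .bin/.idx shard pairs and writes a single merged pair; both paths below are placeholders, and the output directory must already exist because the script asserts on it:

```bash
# Merge every indexed-dataset shard in ./data/shards into ./data/merged/all_text_document.{bin,idx}
mkdir -p ./data/merged
python tools/datasets/merge_datasets.py \
    --input ./data/shards \
    --output-prefix ./data/merged/all_text_document
```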
--------------------------------------------------------------------------------
/configs/pythia/31M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # parallelism settings
3 | "pipe-parallel-size": 0,
4 | "model-parallel-size": 1,
5 |
6 | # model settings
7 | "num-layers": 6,
8 | "hidden-size": 256,
9 | "num-attention-heads": 8,
10 | "seq-length": 2048,
11 | "max-position-embeddings": 2048,
12 | "pos-emb": "rotary",
13 | "rotary-pct": 0.25,
14 | "no-weight-tying": true,
15 | "gpt-j-residual": true,
16 | "output-layer-parallelism": "column",
17 |
18 | "attention-config": [[["flash"], 6]],
19 |
20 | "scaled-upper-triang-masked-softmax-fusion": true,
21 | "bias-gelu-fusion": true,
22 |
23 | # init methods
24 | "init_method": "small_init",
25 | "output_layer_init_method": "wang_init",
26 |
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.001,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8
33 | }
34 | },
35 | "min_lr": 0.0001,
36 |
37 | "zero_optimization": {
38 | "stage": 0,
39 | "allgather_partitions": true,
40 | "allgather_bucket_size": 500000000,
41 | "overlap_comm": true,
42 | "reduce_scatter": true,
43 | "reduce_bucket_size": 500000000,
44 | "contiguous_gradients": true,
45 | "cpu_offload": false
46 | },
47 |
48 | # batch size (trained on 32 gpus)
49 | "train_micro_batch_size_per_gpu": 32,
50 | "data-impl": "mmap",
51 | "num_workers": 2,
52 |
53 | # activation checkpointing
54 | "checkpoint-activations": false,
55 | "checkpoint-num-layers": 1,
56 | "partition-activations": false,
57 | "synchronize-each-layer": true,
58 |
59 | # regularization
60 | "gradient_clipping": 1.0,
61 | "weight-decay": 0.1,
62 | "hidden-dropout": 0,
63 | "attention-dropout": 0,
64 |
65 | # precision settings
66 | "fp16": {
67 | "fp16": true,
68 | "enabled": true,
69 | "loss_scale": 0,
70 | "loss_scale_window": 1000,
71 | "initial_scale_power": 12,
72 | "hysteresis": 2,
73 | "min_loss_scale": 1
74 | },
75 |
76 | "train-iters": 143000,
77 | "lr-decay-iters": 143000,
78 | "distributed-backend": "nccl",
79 | "lr-decay-style": "cosine",
80 | "warmup": 0.01,
81 | "checkpoint-factor": 1000,
82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
83 | "eval-interval": 100000,
84 | "eval-iters": 10,
85 | "log-interval": 10,
86 | "steps_per_print": 10,
87 | "wall_clock_breakdown": true,
88 |
89 | "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
90 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
91 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
92 |
93 | "tokenizer-type": "HFTokenizer",
94 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json"
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/configs/pythia/14M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # parallelism settings
3 | "pipe-parallel-size": 0,
4 | "model-parallel-size": 1,
5 |
6 | # model settings
7 | "num-layers": 6,
8 | "hidden-size": 128,
9 | "num-attention-heads": 4,
10 | "seq-length": 2048,
11 | "max-position-embeddings": 2048,
12 | "pos-emb": "rotary",
13 | "rotary-pct": 0.25,
14 | "no-weight-tying": true,
15 | "gpt-j-residual": true,
16 | "output-layer-parallelism": "column",
17 |
18 | "attention-config": [[["flash"], 6]],
19 |
20 | "scaled-upper-triang-masked-softmax-fusion": true,
21 | "bias-gelu-fusion": true,
22 |
23 | # init methods
24 | "init_method": "small_init",
25 | "output_layer_init_method": "wang_init",
26 |
27 | "optimizer": {
28 | "type": "Adam",
29 | "params": {
30 | "lr": 0.001,
31 | "betas": [0.9, 0.95],
32 | "eps": 1.0e-8
33 | }
34 | },
35 | "min_lr": 0.0001,
36 |
37 | "zero_optimization": {
38 | "stage": 0,
39 | "allgather_partitions": true,
40 | "allgather_bucket_size": 50000000,
41 | "overlap_comm": true,
42 | "reduce_scatter": true,
43 | "reduce_bucket_size": 50000000,
44 | "contiguous_gradients": true,
45 | "cpu_offload": false
46 | },
47 |
48 | # batch size (trained on 32 gpus)
49 | "train_micro_batch_size_per_gpu": 32,
50 | "data-impl": "mmap",
51 | "num_workers": 4,
52 |
53 | # activation checkpointing
54 | "checkpoint-activations": false, #true,
55 | "checkpoint-num-layers": 1,
56 | "partition-activations": false, #true,
57 | "synchronize-each-layer": true,
58 |
59 | # regularization
60 | "gradient_clipping": 1.0,
61 | "weight-decay": 0.1,
62 | "hidden-dropout": 0,
63 | "attention-dropout": 0,
64 |
65 | # precision settings
66 | "fp16": {
67 | "fp16": true,
68 | "enabled": true,
69 | "loss_scale": 0,
70 | "loss_scale_window": 1000,
71 | "initial_scale_power": 12,
72 | "hysteresis": 2,
73 | "min_loss_scale": 1
74 | },
75 |
76 | "train-iters": 143000,
77 | "lr-decay-iters": 143000,
78 | "distributed-backend": "nccl",
79 | "lr-decay-style": "cosine",
80 | "warmup": 0.01,
81 | "checkpoint-factor": 1000,
82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
83 | "eval-interval": 100000,
84 | "eval-iters": 10,
85 |
86 | "log-interval": 10,
87 | "steps_per_print": 10,
88 | "wall_clock_breakdown": true,
89 |
90 | "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
91 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
92 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
93 |
94 | "tokenizer-type": "HFTokenizer",
95 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json"
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/configs/175B.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 96,
10 | "hidden_size": 12288,
11 | "num_attention_heads": 96,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 |    # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.00006,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.000006,
40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41 | "zero_optimization": {
42 | "stage": 1,
43 | "allgather_partitions": True,
44 | "allgather_bucket_size": 500000000,
45 | "overlap_comm": True,
46 | "reduce_scatter": True,
47 | "reduce_bucket_size": 500000000,
48 | "contiguous_gradients": True,
49 | },
50 |
51 | # batch / data settings
52 | "train_micro_batch_size_per_gpu": 4,
53 | "data_impl": "mmap",
54 |
55 | # activation checkpointing
56 | "checkpoint_activations": true,
57 | "checkpoint_num_layers": 1,
58 | "partition_activations": true,
59 | "synchronize_each_layer": true,
60 |
61 | # regularization
62 | "gradient_clipping": 1.0,
63 | "weight_decay": 0.1,
64 | "hidden_dropout": 0,
65 | "attention_dropout": 0,
66 |
67 | # precision settings
68 | "fp16": {
69 | "fp16": true,
70 | "enabled": true,
71 | "loss_scale": 0,
72 | "loss_scale_window": 1000,
73 | "hysteresis": 2,
74 | "min_loss_scale": 1
75 | },
76 |
77 | # misc. training settings
78 | "train_iters": 320000,
79 | "lr_decay_iters": 320000,
80 | "distributed_backend": "nccl",
81 | "lr_decay_style": "cosine",
82 | "warmup": 0.01,
83 | "checkpoint_factor": 10000,
84 | "eval_interval": 1000,
85 | "eval_iters": 10,
86 |
87 | # logging
88 | "log_interval": 100,
89 | "steps_per_print": 10,
90 | "keep_last_n_checkpoints": 4,
91 | "wall_clock_breakdown": true,
92 | }
93 |
--------------------------------------------------------------------------------
/configs/350M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 24,
10 | "hidden_size": 1024,
11 | "num_attention_heads": 16,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 |    # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.0003,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.00003,
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 | # batch / data settings
52 | "train_micro_batch_size_per_gpu": 4,
53 | "data_impl": "mmap",
54 |
55 | # activation checkpointing
56 | "checkpoint_activations": true,
57 | "checkpoint_num_layers": 1,
58 | "partition_activations": true,
59 | "synchronize_each_layer": true,
60 |
61 | # regularization
62 | "gradient_clipping": 1.0,
63 | "weight_decay": 0.1,
64 | "hidden_dropout": 0,
65 | "attention_dropout": 0,
66 |
67 | # precision settings
68 | "fp16": {
69 | "fp16": true,
70 | "enabled": true,
71 | "loss_scale": 0,
72 | "loss_scale_window": 1000,
73 | "hysteresis": 2,
74 | "min_loss_scale": 1
75 | },
76 |
77 | # misc. training settings
78 | "train_iters": 320000,
79 | "lr_decay_iters": 320000,
80 | "distributed_backend": "nccl",
81 | "lr_decay_style": "cosine",
82 | "warmup": 0.01,
83 | "checkpoint_factor": 10000,
84 | "eval_interval": 1000,
85 | "eval_iters": 10,
86 |
87 | # logging
88 | "log_interval": 100,
89 | "steps_per_print": 10,
90 | "keep_last_n_checkpoints": 4,
91 | "wall_clock_breakdown": true,
92 | }
93 |
--------------------------------------------------------------------------------
/configs/1-3B.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 24,
10 | "hidden_size": 2048,
11 | "num_attention_heads": 16,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 |    # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.0002,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.00002,
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 |
52 | # batch / data settings
53 | "train_micro_batch_size_per_gpu": 4,
54 | "data_impl": "mmap",
55 |
56 | # activation checkpointing
57 | "checkpoint_activations": true,
58 | "checkpoint_num_layers": 1,
59 | "partition_activations": true,
60 | "synchronize_each_layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight_decay": 0.1,
65 | "hidden_dropout": 0,
66 | "attention_dropout": 0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "fp16": true,
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train_iters": 320000,
80 | "lr_decay_iters": 320000,
81 | "distributed_backend": "nccl",
82 | "lr_decay_style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint_factor": 10000,
85 | "eval_interval": 1000,
86 | "eval_iters": 10,
87 |
88 | # logging
89 | "log_interval": 100,
90 | "steps_per_print": 10,
91 | "keep_last_n_checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 | }
94 |
--------------------------------------------------------------------------------
/configs/2-7B.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 32,
10 | "hidden_size": 2560,
11 | "num_attention_heads": 32,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 |    # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.00016,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.000016,
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 |
52 | # batch / data settings
53 | "train_micro_batch_size_per_gpu": 4,
54 | "data_impl": "mmap",
55 |
56 | # activation checkpointing
57 | "checkpoint_activations": true,
58 | "checkpoint_num_layers": 1,
59 | "partition_activations": true,
60 | "synchronize_each_layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight_decay": 0.1,
65 | "hidden_dropout": 0,
66 | "attention_dropout": 0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "fp16": true,
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train_iters": 320000,
80 | "lr_decay_iters": 320000,
81 | "distributed_backend": "nccl",
82 | "lr_decay_style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint_factor": 10000,
85 | "eval_interval": 1000,
86 | "eval_iters": 10,
87 |
88 | # logging
89 | "log_interval": 100,
90 | "steps_per_print": 10,
91 | "keep_last_n_checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 | }
94 |
--------------------------------------------------------------------------------
/configs/6-7B.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 32,
10 | "hidden_size": 4096,
11 | "num_attention_heads": 32,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 |    # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.00012,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 |
40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41 | "zero_optimization": {
42 | "stage": 1,
43 | "allgather_partitions": True,
44 | "allgather_bucket_size": 500000000,
45 | "overlap_comm": True,
46 | "reduce_scatter": True,
47 | "reduce_bucket_size": 500000000,
48 | "contiguous_gradients": True,
49 | },
50 | "min_lr": 0.000012,
51 |
52 | # batch / data settings
53 | "train_micro_batch_size_per_gpu": 4,
54 | "data_impl": "mmap",
55 |
56 | # activation checkpointing
57 | "checkpoint_activations": true,
58 | "checkpoint_num_layers": 1,
59 | "partition_activations": true,
60 | "synchronize_each_layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight_decay": 0.1,
65 | "hidden_dropout": 0,
66 | "attention_dropout": 0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "fp16": true,
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train_iters": 320000,
80 | "lr_decay_iters": 320000,
81 | "distributed_backend": "nccl",
82 | "lr_decay_style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint_factor": 10000,
85 | "eval_interval": 1000,
86 | "eval_iters": 10,
87 |
88 | # logging
89 | "log_interval": 100,
90 | "steps_per_print": 10,
91 | "keep_last_n_checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 | }
94 |
--------------------------------------------------------------------------------
/configs/13B.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 40,
10 | "hidden_size": 5120,
11 | "num_attention_heads": 40,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0001,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 | "min_lr": 0.00001,
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 4,
55 | "data_impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint_activations": true,
59 | "checkpoint_num_layers": 1,
60 | "partition_activations": true,
61 | "synchronize_each_layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight_decay": 0.1,
66 | "hidden_dropout": 0,
67 | "attention_dropout": 0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "fp16": true,
72 | "enabled": true,
73 | "loss_scale": 0,
74 | "loss_scale_window": 1000,
75 | "hysteresis": 2,
76 | "min_loss_scale": 1
77 | },
78 |
79 | # misc. training settings
80 | "train_iters": 320000,
81 | "lr_decay_iters": 320000,
82 | "distributed_backend": "nccl",
83 | "lr_decay_style": "cosine",
84 | "warmup": 0.01,
85 | "checkpoint_factor": 10000,
86 | "eval_interval": 1000,
87 | "eval_iters": 10,
88 |
89 | # logging
90 | "log_interval": 100,
91 | "steps_per_print": 10,
92 | "keep_last_n_checkpoints": 4,
93 | "wall_clock_breakdown": true,
94 | }
95 |
--------------------------------------------------------------------------------
/configs/760M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 24,
10 | "hidden_size": 1536,
11 | "num_attention_heads": 16,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.00025,
35 | "betas": [0.9, 0.999],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.000025,
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 |
52 | # batch / data settings
53 | "train_micro_batch_size_per_gpu": 4,
54 | "data_impl": "mmap",
55 |
56 | # activation checkpointing
57 | "checkpoint_activations": true,
58 | "checkpoint_num_layers": 1,
59 | "partition_activations": true,
60 | "synchronize_each_layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight_decay": 0.1,
65 | "hidden_dropout": 0,
66 | "attention_dropout": 0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "fp16": true,
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train_iters": 320000,
80 | "lr_decay_iters": 320000,
81 | "distributed_backend": "nccl",
82 | "lr_decay_style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint_factor": 10000,
85 | "eval_interval": 1000,
86 | "eval_iters": 10,
87 |
88 | # logging
89 | "log_interval": 100,
90 | "steps_per_print": 10,
91 | "keep_last_n_checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 | }
94 |
--------------------------------------------------------------------------------
/megatron/mpu/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Model parallel utility interface."""
16 |
17 | from .cross_entropy import vocab_parallel_cross_entropy
18 |
19 | from .data import broadcast_data
20 |
21 | from .initialize import is_unitialized
22 | from .initialize import destroy_model_parallel
23 | from .initialize import get_data_parallel_group
24 | from .initialize import get_data_parallel_rank
25 | from .initialize import get_data_parallel_world_size
26 | from .initialize import get_model_parallel_group
27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank
28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank
29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size
30 | from .initialize import get_topology
31 | from .initialize import get_pipe_parallel_group
32 | from .initialize import get_pipe_parallel_rank
33 | from .initialize import get_pipe_parallel_world_size
34 | from .initialize import get_tensor_model_parallel_group
35 | from .initialize import get_tensor_model_parallel_rank
36 | from .initialize import get_tensor_model_parallel_world_size
37 | from .initialize import get_io_parallel_group
38 | from .initialize import initialize_model_parallel
39 | from .initialize import model_parallel_is_initialized
40 |
41 | from .layers import ColumnParallelLinear
42 | from .layers import RowParallelLinear
43 | from .layers import VocabParallelEmbedding
44 | from .layers import ParallelRelativePositionBias
45 |
46 | from .mappings import copy_to_model_parallel_region
47 | from .mappings import gather_from_model_parallel_region
48 | from .mappings import reduce_from_model_parallel_region
49 | from .mappings import scatter_to_model_parallel_region
50 | from .mappings import reduce_scatter_to_sequence_parallel_region
51 | from .mappings import gather_from_sequence_parallel_region
52 | from .mappings import scatter_to_sequence_parallel_region
53 |
54 | from .random import checkpoint
55 | from .random import get_cuda_rng_tracker
56 | from .random import model_parallel_cuda_manual_seed
57 |
58 | from .utils import divide
59 | from .utils import split_tensor_along_last_dim
60 |
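The splitting helpers re-exported at the bottom show the basic operation behind this interface: carve a tensor into equal shards along its last dimension, one shard per model-parallel rank. A minimal standalone sketch mirroring the semantics of divide and split_tensor_along_last_dim (written independently here so it runs without initializing torch.distributed; it is not an import of megatron.mpu):

import torch

def divide(numerator: int, denominator: int) -> int:
    # mirrors mpu.utils.divide: enforce exact divisibility, return the quotient
    assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}"
    return numerator // denominator

def split_tensor_along_last_dim(tensor: torch.Tensor, num_partitions: int):
    # mirrors mpu.utils.split_tensor_along_last_dim: equal shards over the last dim
    last_dim_size = divide(tensor.size(-1), num_partitions)
    return torch.split(tensor, last_dim_size, dim=-1)

# e.g. a [batch, seq, hidden] activation split across 2 model-parallel ranks
shards = split_tensor_along_last_dim(torch.randn(4, 2048, 768), 2)
print([tuple(s.shape) for s in shards])  # [(4, 2048, 384), (4, 2048, 384)]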
--------------------------------------------------------------------------------
/configs/125M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe_parallel_size": 1,
6 | "model_parallel_size": 1,
7 |
8 | # model settings
9 | "num_layers": 12,
10 | "hidden_size": 768,
11 | "num_attention_heads": 12,
12 | "seq_length": 2048,
13 | "max_position_embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos_emb": "rotary",
16 | "no_weight_tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled_upper_triang_masked_softmax_fusion": false,
22 | "bias_gelu_fusion": false,
23 | "rope_fusion": false,
24 | "layernorm_fusion": false,
25 |
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0006,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 | "min_lr": 0.00006,
41 |
42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43 | "zero_optimization": {
44 | "stage": 1,
45 | "allgather_partitions": True,
46 | "allgather_bucket_size": 500000000,
47 | "overlap_comm": True,
48 | "reduce_scatter": True,
49 | "reduce_bucket_size": 500000000,
50 | "contiguous_gradients": True,
51 | },
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 4,
55 | "data_impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint_activations": true,
59 | "checkpoint_num_layers": 1,
60 | "partition_activations": true,
61 | "synchronize_each_layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight_decay": 0.1,
66 | "hidden_dropout": 0.0,
67 | "attention_dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train_iters": 320000,
80 | "lr_decay_iters": 320000,
81 | "distributed_backend": "nccl",
82 | "lr_decay_style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint_factor": 10000,
85 | "eval_interval": 1000,
86 | "eval_iters": 10,
87 |
88 | # logging
89 | "log_interval": 100,
90 | "steps_per_print": 10,
91 | "keep_last_n_checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path"
96 | }
97 |
--------------------------------------------------------------------------------
/configs/125M/512/125M_cope.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "cope",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | # init methods
25 | "init_method": "small_init",
26 | "output_layer_init_method": "wang_init",
27 |
28 |
29 | # optimizer settings
30 | "optimizer": {
31 | "type": "Adam",
32 | "params": {
33 | "lr": 0.0006,
34 | "betas": [0.9, 0.95],
35 | "eps": 1.0e-8,
36 | }
37 | },
38 | "min_lr": 0.00006,
39 |
40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41 | "zero_optimization": {
42 | "stage": 1,
43 | "allgather_partitions": True,
44 | "allgather_bucket_size": 500000000,
45 | "overlap_comm": True,
46 | "reduce_scatter": True,
47 | "reduce_bucket_size": 500000000,
48 | "contiguous_gradients": True,
49 | },
50 |
51 | # batch / data settings
52 | "train_micro_batch_size_per_gpu": 32,
53 | "data-impl": "mmap",
54 |
55 | # activation checkpointing
56 | "checkpoint-activations": true,
57 | "checkpoint-num-layers": 1,
58 | "partition-activations": true,
59 | "synchronize-each-layer": true,
60 |
61 | # regularization
62 | "gradient_clipping": 1.0,
63 | "weight-decay": 0.1,
64 | "hidden-dropout": 0.0,
65 | "attention-dropout": 0.0,
66 |
67 | # precision settings
68 | "fp16": {
69 | "enabled": true,
70 | "loss_scale": 0,
71 | "loss_scale_window": 1000,
72 | "hysteresis": 2,
73 | "min_loss_scale": 1
74 | },
75 |
76 | # misc. training settings
77 | "train-iters": 50000,
78 | "lr-decay-iters": 50000,
79 | "distributed-backend": "nccl",
80 | "lr-decay-style": "cosine",
81 | "warmup": 0.01,
82 | "checkpoint-factor": 10000,
83 | "eval-interval": 5000,
84 | "eval-iters": 20,
85 |
86 | # logging
87 | "log-interval": 100,
88 | "steps_per_print": 10,
89 | "keep-last-n-checkpoints": 4,
90 | "wall_clock_breakdown": true,
91 |
92 | # networking
93 | "hostfile": "/mock_path",
94 | "save": "checkpoints/125M_cope",
95 | "load": "checkpoints/125M_cope",
96 | "tensorboard-dir": "tensorboard/125M_cope",
97 | "log-dir": "logs/125M_cope",
98 | }
99 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # wandb logs
132 | wandb/
133 |
134 | # data files
135 | data/**/*.idx
136 | data/**/*.bin
137 | data/**/*.json*
138 | data/**/*.txt
139 | data/**/*.gz
140 | data/**/*.zip
141 | data/**/*.np*
142 | data/**/*.npy
143 | checkpoints/
144 | .vscode/
145 | *.pt
146 | *.ckpt
147 |
148 | #test logs
149 | test_checkpoint/
150 | test_logs/
151 | logs/
152 | tensorboard/
153 | src/
154 |
155 | # test data files
156 | tests/data/*.bin
157 | tests/data/*.idx
158 |
--------------------------------------------------------------------------------
/configs/rwkv/170M.yml:
--------------------------------------------------------------------------------
1 | {
2 | # Parallelism is not yet supported for rwkv
3 | "pipe_parallel_size": 1,
4 | "model_parallel_size": 1,
5 |
6 | "num_layers": 12,
7 | "hidden_size": 768,
8 | "num_attention_heads": 12, # head_size = dim_att / num_attention_heads.
9 | # head_size is 64 for all rwkv models
10 | "seq_length": 512,
11 | "max_position_embeddings": 2048,
12 | "output_layer_parallelism": "column",
13 | "norm": "rmsnorm",
14 | "rms_norm_epsilon": 1.0e-5,
15 | "train_micro_batch_size_per_gpu": 32,
16 |
17 | "attention_config": [[["rwkv"], 12]],
18 |
19 | "activation": "silu",
20 |
21 | # model settings
22 |
23 | #"pos_emb": "rotary",
24 | "rotary_pct": 0.25,
25 | "no_weight_tying": true,
26 | "gpt_j_residual": true,
27 |
28 | # these should provide some speedup but take a while to build, set to true if desired
29 | "scaled_upper_triang_masked_softmax_fusion": false,
30 | "bias_gelu_fusion": false,
31 | "rope_fusion": false,
32 | "layernorm_fusion": false,
33 |
34 |
35 | # init methods
36 | "init_method": "small_init",
37 | "output_layer_init_method": "wang_init",
38 |
39 | # optimizer settings
40 | "optimizer": {
41 | "type": "Adam",
42 | "params": {
43 | "lr": 0.0008,
44 | "betas": [0.9, 0.95],
45 | "eps": 1.0e-8,
46 | }
47 | },
48 | "min_lr": 0.00008,
49 |
50 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
51 | "zero_optimization": {
52 | "stage": 1,
53 | "allgather_partitions": True,
54 | "allgather_bucket_size": 500000000,
55 | "overlap_comm": True,
56 | "reduce_scatter": True,
57 | "reduce_bucket_size": 500000000,
58 | "contiguous_gradients": True,
59 | },
60 |
61 | # batch / data settings
62 | "data_impl": "mmap",
63 | "num_workers": 1,
64 |
65 | # activation checkpointing
66 | "checkpoint_activations": true,
67 | "checkpoint_num_layers": 1,
68 | "partition_activations": true,
69 | "synchronize_each_layer": true,
70 |
71 | # regularization
72 | "gradient_clipping": 1.0,
73 | "weight_decay": 0.1,
74 | "hidden_dropout": 0,
75 | "attention_dropout": 0,
76 |
77 | # precision settings
78 | "bf16": {
79 | "bf16": true,
80 | "enabled": true,
81 | "loss_scale": 0,
82 | "loss_scale_window": 1000,
83 | "initial_scale_power": 12,
84 | "hysteresis": 2,
85 | "min_loss_scale": 1,
86 | },
87 |
88 | # misc. training settings
89 | "train_iters": 500,
90 | "lr_decay_iters": 500,
91 | "distributed_backend": "nccl",
92 | "lr_decay_style": "constant",
93 | "warmup": 0.01,
94 | "checkpoint_factor": 100,
95 | "eval_interval": 100000,
96 | "eval_iters": 10,
97 |
98 | # logging
99 | "log_interval": 10,
100 | "steps_per_print": 10,
101 | "wall_clock_breakdown": true,
102 | }
103 |
--------------------------------------------------------------------------------
/megatron/data/blendable_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Blendable dataset."""
19 |
20 | import time
21 |
22 | import numpy as np
23 | import torch
24 |
25 | from megatron import print_rank_0
26 | from megatron import mpu
27 |
28 |
29 | class BlendableDataset(torch.utils.data.Dataset):
30 | def __init__(self, datasets, weights):
31 | self.datasets = datasets
32 | num_datasets = len(datasets)
33 | assert num_datasets == len(weights)
34 |
35 | self.size = 0
36 | for dataset in self.datasets:
37 | self.size += len(dataset)
38 |
39 | # Normalize weights.
40 | weights = np.array(weights, dtype=np.float64)
41 | sum_weights = np.sum(weights)
42 | assert sum_weights > 0.0
43 | weights /= sum_weights
44 |
45 | # Build indices.
46 | start_time = time.time()
47 | assert num_datasets < 255
48 | self.dataset_index = np.zeros(self.size, dtype=np.uint8)
49 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
50 |
51 | from megatron.data import helpers
52 |
53 | helpers.build_blending_indices(
54 | self.dataset_index,
55 | self.dataset_sample_index,
56 | weights,
57 | num_datasets,
58 | self.size,
59 | torch.distributed.get_rank() == 0,
60 | )
61 |
62 | print(
63 | "> RANK {} elapsed time for building blendable dataset indices: "
64 | "{:.2f} (sec)".format(
65 | torch.distributed.get_rank(), time.time() - start_time
66 | )
67 | )
68 |
69 | def __len__(self):
70 | return self.size
71 |
72 | def __getitem__(self, idx):
73 | try:
74 | dataset_idx = self.dataset_index[idx]
75 | sample_idx = self.dataset_sample_index[idx]
76 | return self.datasets[dataset_idx][sample_idx]
77 | except IndexError:
78 | new_idx = idx % len(self)
79 | print(
80 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})"
81 | )
82 | return self[new_idx]
83 |
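helpers.build_blending_indices is a compiled C++ helper, but the blending it performs is simple to sketch: each slot goes to whichever dataset is currently furthest behind its target weight. A rough pure-Python illustration of that idea (not the compiled helper itself, and ignoring its rank-0 logging flag):

import numpy as np

def build_blending_indices(weights, size):
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()
    dataset_index = np.zeros(size, dtype=np.uint8)         # which dataset each slot draws from
    dataset_sample_index = np.zeros(size, dtype=np.int64)  # running sample index within that dataset
    current = np.zeros(len(weights), dtype=np.int64)
    for i in range(size):
        errors = weights * max(i, 1) - current   # how far each dataset lags its quota
        d = int(np.argmax(errors))
        dataset_index[i] = d
        dataset_sample_index[i] = current[d]
        current[d] += 1
    return dataset_index, dataset_sample_index

idx, _ = build_blending_indices([0.7, 0.3], 10)
print(idx)  # [0 1 0 0 1 0 0 1 0 0], a ~70/30 interleaving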
--------------------------------------------------------------------------------
/megatron/model/rwkv/v6/cuda/wkv6_op.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include "ATen/ATen.h"
3 | typedef at::BFloat16 bf16;
4 |
5 | void cuda_forward(int B,
6 | int T,
7 | int C,
8 | int H,
9 | bf16* r,
10 | bf16* k,
11 | bf16* v,
12 | float* w,
13 | bf16* u,
14 | bf16* y);
15 | void cuda_backward(int B,
16 | int T,
17 | int C,
18 | int H,
19 | bf16* r,
20 | bf16* k,
21 | bf16* v,
22 | float* w,
23 | bf16* u,
24 | bf16* gy,
25 | bf16* gr,
26 | bf16* gk,
27 | bf16* gv,
28 | bf16* gw,
29 | bf16* gu);
30 |
31 | void forward(int64_t B,
32 | int64_t T,
33 | int64_t C,
34 | int64_t H,
35 | torch::Tensor& r,
36 | torch::Tensor& k,
37 | torch::Tensor& v,
38 | torch::Tensor& w,
39 | torch::Tensor& u,
40 | torch::Tensor& y)
41 | {
42 | cuda_forward(B,
43 | T,
44 | C,
45 | H,
46 | r.data_ptr(),
47 | k.data_ptr(),
48 | v.data_ptr(),
49 | w.data_ptr(),
50 | u.data_ptr(),
51 | y.data_ptr());
52 | }
53 | void backward(int64_t B,
54 | int64_t T,
55 | int64_t C,
56 | int64_t H,
57 | torch::Tensor& r,
58 | torch::Tensor& k,
59 | torch::Tensor& v,
60 | torch::Tensor& w,
61 | torch::Tensor& u,
62 | torch::Tensor& gy,
63 | torch::Tensor& gr,
64 | torch::Tensor& gk,
65 | torch::Tensor& gv,
66 | torch::Tensor& gw,
67 | torch::Tensor& gu)
68 | {
69 | cuda_backward(B,
70 | T,
71 | C,
72 | H,
73 | r.data_ptr(),
74 | k.data_ptr(),
75 | v.data_ptr(),
76 | w.data_ptr(),
77 | u.data_ptr(),
78 | gy.data_ptr(),
79 | gr.data_ptr(),
80 | gk.data_ptr(),
81 | gv.data_ptr(),
82 | gw.data_ptr(),
83 | gu.data_ptr());
84 | }
85 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
86 | {
87 | m.def("forward", &forward, "wkv6 forward");
88 | m.def("backward", &backward, "wkv6 backward");
89 | }
90 |
91 | TORCH_LIBRARY(wkv6, m)
92 | {
93 | m.def("forward", forward);
94 | m.def("backward", backward);
95 | }
96 |
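This file only declares the bindings; the actual kernels live in a companion .cu file, and the pair is typically JIT-compiled into a Python extension at runtime. A hedged sketch of how such an extension can be built with torch.utils.cpp_extension.load (the companion filename and compile flags below are assumptions for illustration, not values read from this repo's loader):

from torch.utils.cpp_extension import load

HEAD_SIZE = 64  # head_size is 64 for all rwkv models (see configs/rwkv/170M.yml)

wkv6 = load(
    name="wkv6",
    sources=[
        "megatron/model/rwkv/v6/cuda/wkv6_op.cpp",
        "megatron/model/rwkv/v6/cuda/wkv6_cuda.cu",   # assumed companion kernel file
    ],
    verbose=True,
    extra_cuda_cflags=["-O3", f"-D_N_={HEAD_SIZE}"],  # assumed flags
)

# once built, the entry points bound above are callable from Python:
#   wkv6.forward(B, T, C, H, r, k, v, w, u, y)
#   wkv6.backward(B, T, C, H, r, k, v, w, u, gy, gr, gk, gv, gw, gu)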
--------------------------------------------------------------------------------
/configs/125M/512/125M_fire.yml:
--------------------------------------------------------------------------------
1 | {
2 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
3 | # across the node boundaries )
4 | "pipe-parallel-size": 1,
5 | "model-parallel-size": 1,
6 |
7 | # model settings
8 | "num-layers": 12,
9 | "hidden-size": 768,
10 | "num-attention-heads": 12,
11 | "seq-length": 512,
12 | "max-position-embeddings": 2048,
13 | "norm": "layernorm",
14 | "pos-emb": "fire",
15 | "no-weight-tying": true,
16 | "gpt_j_residual": false,
17 | "output_layer_parallelism": "column",
18 |
19 | # these should provide some speedup but take a while to build, set to true if desired
20 | "scaled-upper-triang-masked-softmax-fusion": false,
21 | "bias-gelu-fusion": false,
22 |
23 | "mlp_width": 32,
24 | "noise_seq_length": 128,
25 | # init methods
26 | "init_method": "small_init",
27 | "output_layer_init_method": "wang_init",
28 |
29 |
30 | # optimizer settings
31 | "optimizer": {
32 | "type": "Adam",
33 | "params": {
34 | "lr": 0.0006,
35 | "betas": [0.9, 0.95],
36 | "eps": 1.0e-8,
37 | }
38 | },
39 | "min_lr": 0.00006,
40 |
41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42 | "zero_optimization": {
43 | "stage": 1,
44 | "allgather_partitions": True,
45 | "allgather_bucket_size": 500000000,
46 | "overlap_comm": True,
47 | "reduce_scatter": True,
48 | "reduce_bucket_size": 500000000,
49 | "contiguous_gradients": True,
50 | },
51 |
52 | # batch / data settings
53 | "train_micro_batch_size_per_gpu": 32,
54 | "data-impl": "mmap",
55 |
56 | # activation checkpointing
57 | "checkpoint-activations": true,
58 | "checkpoint-num-layers": 1,
59 | "partition-activations": true,
60 | "synchronize-each-layer": true,
61 |
62 | # regularization
63 | "gradient_clipping": 1.0,
64 | "weight-decay": 0.1,
65 | "hidden-dropout": 0.0,
66 | "attention-dropout": 0.0,
67 |
68 | # precision settings
69 | "fp16": {
70 | "enabled": true,
71 | "loss_scale": 0,
72 | "loss_scale_window": 1000,
73 | "hysteresis": 2,
74 | "min_loss_scale": 1
75 | },
76 |
77 | # misc. training settings
78 | "train-iters": 50000,
79 | "lr-decay-iters": 50000,
80 | "distributed-backend": "nccl",
81 | "lr-decay-style": "cosine",
82 | "warmup": 0.01,
83 | "checkpoint-factor": 10000,
84 | "eval-interval": 5000,
85 | "eval-iters": 20,
86 |
87 | # logging
88 | "log-interval": 100,
89 | "steps_per_print": 10,
90 | "keep-last-n-checkpoints": 4,
91 | "wall_clock_breakdown": true,
92 |
93 | # networking
94 | "hostfile": "/mock_path",
95 | "save": "checkpoints",
96 | "load": "checkpoints",
97 | "tensorboard-dir": "tensorboard",
98 | "log-dir": "logs",
99 | }
100 |
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 |
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 |
25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);
26 |
27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
28 | torch::Tensor const& softmax_results,
29 | float scale_factor);
30 |
31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor)
32 | {
33 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
34 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
35 | (input.scalar_type() == at::ScalarType::BFloat16),
36 | "Only fp16 and bf16 are supported");
37 |
38 | return fwd_cuda(input, scale_factor);
39 | }
40 |
41 | torch::Tensor bwd(torch::Tensor const& output_grads,
42 | torch::Tensor const& softmax_results,
43 | float scale_factor)
44 | {
45 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
46 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
47 |
48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 | (output_grads.scalar_type() == at::ScalarType::BFloat16),
50 | "Only fp16 and bf16 are supported");
51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16),
53 | "Only fp16 and bf16 are supported");
54 |
55 | return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 |
58 | } // end namespace scaled_upper_triang_masked_softmax
59 | } // end namespace fused_softmax
60 | } // end namespace multihead_attn
61 |
62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
63 | {
64 | m.def("forward",
65 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
66 | "Self Multihead Attention scaled, time masked softmax -- Forward.");
67 | m.def("backward",
68 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
69 | "Self Multihead Attention scaled, time masked softmax -- Backward.");
70 | }
71 |
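The fused kernel bound here computes a scaled, causally masked softmax over a [batches, sq, sk] score tensor. An unfused PyTorch reference is handy when checking the fused path; a minimal sketch:

import torch

def scaled_upper_triang_masked_softmax_ref(scores: torch.Tensor, scale: float) -> torch.Tensor:
    # scores: [batches, sq, sk] with sq == sk, fp16 or bf16 as asserted above
    assert scores.dim() == 3 and scores.size(1) == scores.size(2)
    sq = scores.size(1)
    # strictly upper triangle = future positions, masked to -inf before the softmax
    causal_mask = torch.triu(torch.ones(sq, sq, dtype=torch.bool, device=scores.device), diagonal=1)
    masked = (scores * scale).masked_fill(causal_mask, float("-inf"))
    return torch.softmax(masked, dim=-1)

probs = scaled_upper_triang_masked_softmax_ref(torch.randn(2, 8, 8, dtype=torch.bfloat16), 0.125)
print(probs[0, 0])  # the first query position attends only to position 0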
--------------------------------------------------------------------------------
/tests/neox_args/test_neoxargs_usage.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | plausibility check for the usage of neox_args in the megatron codebase
17 | """
18 | import pytest
19 | import re
20 | from ..common import get_root_directory
21 |
22 |
23 | @pytest.mark.cpu
24 | def test_neoxargs_usage():
25 | """ "
26 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs
27 | """
28 | from megatron.neox_arguments import NeoXArgs
29 |
30 | declared_all = True
31 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys())
32 |
33 | # we exclude a number of properties (implemented with the @property decorator) or functions that we know exist
34 | exclude = set(
35 | [
36 | "params_dtype",
37 | "deepspeed_config",
38 | "get",
39 | "pop",
40 | "get_deepspeed_main_args",
41 | 'optimizer["params"]',
42 | "attention_config[layer_number]",
43 | "adlr_autoresume_object",
44 | "update_value",
45 | "all_config",
46 | "tensorboard_writer",
47 | "tokenizer",
48 | "train_batch_size]",
49 | "items",
50 | "configure_distributed_args",
51 | "build_tokenizer",
52 | "attention_config[i]",
53 | "print",
54 | "update",
55 | ]
56 | )
57 |
58 | # test file by file
59 | for filename in (get_root_directory() / "megatron").glob("**/*.py"):
60 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]:
61 | continue
62 |
63 | # load file
64 | with open(filename, "r") as f:
65 | file_contents = f.read()
66 |
67 | # find args matches
68 | matches = list(
69 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
70 | )
71 | if len(matches) == 0:
72 | continue
73 |
74 | # compare
75 | for match in matches:
76 | if match not in neox_args_attributes and match not in exclude:
77 | print(
78 | f"(arguments used not found in neox args): {filename.name}: {match}",
79 | flush=True,
80 | )
81 | declared_all = False
82 |
83 | assert declared_all, "found arguments used in the megatron code that are not defined in NeoXArgs"
84 |
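The regular expression above does the heavy lifting: it captures whatever follows an "args." prefix up to the next delimiter, and each capture is then checked against the NeoXArgs fields. A tiny demonstration on an invented snippet of Megatron-style code:

import re

pattern = r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])"
snippet = "lr = neox_args.lr\nif neox_args.train_iters > 0:\n    print(neox_args.hidden_size)\n"
print(re.findall(pattern, snippet))  # ['lr', 'train_iters', 'hidden_size']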
--------------------------------------------------------------------------------
/configs/125M/512/125M_alibi.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "alibi",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | "mlp_width": 32,
25 | "noise_seq_length": 128,
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0006,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 | "min_lr": 0.00006,
41 |
42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43 | "zero_optimization": {
44 | "stage": 1,
45 | "allgather_partitions": True,
46 | "allgather_bucket_size": 500000000,
47 | "overlap_comm": True,
48 | "reduce_scatter": True,
49 | "reduce_bucket_size": 500000000,
50 | "contiguous_gradients": True,
51 | },
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 32,
55 | "data-impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint-activations": true,
59 | "checkpoint-num-layers": 1,
60 | "partition-activations": true,
61 | "synchronize-each-layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight-decay": 0.1,
66 | "hidden-dropout": 0.0,
67 | "attention-dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train-iters": 50000,
80 | "lr-decay-iters": 50000,
81 | "distributed-backend": "nccl",
82 | "lr-decay-style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint-factor": 10000,
85 | "eval-interval": 5000,
86 | "eval-iters": 20,
87 |
88 | # logging
89 | "log-interval": 100,
90 | "steps_per_print": 10,
91 | "keep-last-n-checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path",
96 | "save": "checkpoints",
97 | "load": "checkpoints",
98 | "tensorboard-dir": "tensorboard",
99 | "log-dir": "logs",
100 | }
101 |
--------------------------------------------------------------------------------
/configs/125M/512/125M_alibi_c.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "alibi_c",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | "mlp_width": 32,
25 | "noise_seq_length": 128,
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0006,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 | "min_lr": 0.00006,
41 |
42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43 | "zero_optimization": {
44 | "stage": 1,
45 | "allgather_partitions": True,
46 | "allgather_bucket_size": 500000000,
47 | "overlap_comm": True,
48 | "reduce_scatter": True,
49 | "reduce_bucket_size": 500000000,
50 | "contiguous_gradients": True,
51 | },
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 32,
55 | "data-impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint-activations": true,
59 | "checkpoint-num-layers": 1,
60 | "partition-activations": true,
61 | "synchronize-each-layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight-decay": 0.1,
66 | "hidden-dropout": 0.0,
67 | "attention-dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train-iters": 50000,
80 | "lr-decay-iters": 50000,
81 | "distributed-backend": "nccl",
82 | "lr-decay-style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint-factor": 10000,
85 | "eval-interval": 5000,
86 | "eval-iters": 20,
87 |
88 | # logging
89 | "log-interval": 100,
90 | "steps_per_print": 10,
91 | "keep-last-n-checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path",
96 | "save": "checkpoints",
97 | "load": "checkpoints",
98 | "tensorboard-dir": "tensorboard",
99 | "log-dir": "logs",
100 | }
101 |
--------------------------------------------------------------------------------
/configs/125M/512/125M_kerple.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "kerple",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | "mlp_width": 32,
25 | "noise_seq_length": 128,
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0006,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 | "min_lr": 0.00006,
41 |
42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43 | "zero_optimization": {
44 | "stage": 1,
45 | "allgather_partitions": True,
46 | "allgather_bucket_size": 500000000,
47 | "overlap_comm": True,
48 | "reduce_scatter": True,
49 | "reduce_bucket_size": 500000000,
50 | "contiguous_gradients": True,
51 | },
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 32,
55 | "data-impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint-activations": true,
59 | "checkpoint-num-layers": 1,
60 | "partition-activations": true,
61 | "synchronize-each-layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight-decay": 0.1,
66 | "hidden-dropout": 0.0,
67 | "attention-dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train-iters": 50000,
80 | "lr-decay-iters": 50000,
81 | "distributed-backend": "nccl",
82 | "lr-decay-style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint-factor": 5000,
85 | "eval-interval": 5000,
86 | "eval-iters": 20,
87 |
88 | # logging
89 | "log-interval": 100,
90 | "steps_per_print": 10,
91 | "keep-last-n-checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path",
96 | "save": "checkpoints",
97 | "load": "checkpoints",
98 | "tensorboard-dir": "tensorboard",
99 | "log-dir": "logs",
100 | }
101 |
--------------------------------------------------------------------------------
/configs/125M/512/125M_fire_c.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "fire_c",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | "mlp_width": 32,
25 | "noise_seq_length": 128,
26 | # init methods
27 | "init_method": "small_init",
28 | "output_layer_init_method": "wang_init",
29 |
30 |
31 | # optimizer settings
32 | "optimizer": {
33 | "type": "Adam",
34 | "params": {
35 | "lr": 0.0006,
36 | "betas": [0.9, 0.95],
37 | "eps": 1.0e-8,
38 | }
39 | },
40 | "min_lr": 0.00006,
41 |
42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43 | "zero_optimization": {
44 | "stage": 1,
45 | "allgather_partitions": True,
46 | "allgather_bucket_size": 500000000,
47 | "overlap_comm": True,
48 | "reduce_scatter": True,
49 | "reduce_bucket_size": 500000000,
50 | "contiguous_gradients": True,
51 | },
52 |
53 | # batch / data settings
54 | "train_micro_batch_size_per_gpu": 32,
55 | "data-impl": "mmap",
56 |
57 | # activation checkpointing
58 | "checkpoint-activations": true,
59 | "checkpoint-num-layers": 1,
60 | "partition-activations": true,
61 | "synchronize-each-layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight-decay": 0.1,
66 | "hidden-dropout": 0.0,
67 | "attention-dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train-iters": 50000,
80 | "lr-decay-iters": 50000,
81 | "distributed-backend": "nccl",
82 | "lr-decay-style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint-factor": 10000,
85 | "eval-interval": 5000,
86 | "eval-iters": 20,
87 |
88 | # logging
89 | "log-interval": 100,
90 | "steps_per_print": 10,
91 | "keep-last-n-checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path",
96 | "save": "checkpoints",
97 | "load": "checkpoints",
98 | "tensorboard-dir": "tensorboard",
99 | "log-dir": "logs",
100 | }
101 |
--------------------------------------------------------------------------------
/configs/125M-moe.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # See README for MoE config docs!
4 | "moe_type": "deepspeed",
5 | "moe_token_dropping": true,
6 | # Use 4 experts per MoE layer (MoE layers occur every 2 transformer layers by default)
7 | "moe_num_experts": 4,
8 | # parallelism settings
9 | "enable_expert_tensor_parallelism": true,
10 | "pipe_parallel_size": 1, # not yet supported for MoE
11 | "model_parallel_size": 1,
12 | "moe_expert_parallel_size": 1,
13 |
14 | # model settings
15 | "num_layers": 12,
16 | "hidden_size": 768,
17 | "num_attention_heads": 12,
18 | "seq_length": 2048,
19 | "max_position_embeddings": 2048,
20 | "norm": "layernorm",
21 | "pos_emb": "rotary",
22 | "no_weight_tying": true,
23 | "gpt_j_residual": false,
24 | "output_layer_parallelism": "column",
25 |
26 | # these should provide some speedup but take a while to build, set to true if desired
27 | "scaled_upper_triang_masked_softmax_fusion": false,
28 | "bias_gelu_fusion": false,
29 | "rope_fusion": false,
30 |
31 | # init methods
32 | "init_method": "small_init",
33 | "output_layer_init_method": "wang_init",
34 |
35 |
36 | # optimizer settings
37 | "optimizer": {
38 | "type": "Adam",
39 | "params": {
40 | "lr": 0.0006,
41 | "betas": [0.9, 0.95],
42 | "eps": 1.0e-8,
43 | }
44 | },
45 | "min_lr": 0.00006,
46 |
47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
48 | "zero_optimization": {
49 | "stage": 1,
50 | "allgather_partitions": True,
51 | "allgather_bucket_size": 500000000,
52 | "overlap_comm": True,
53 | "reduce_scatter": True,
54 | "reduce_bucket_size": 500000000,
55 | "contiguous_gradients": True,
56 | },
57 |
58 | # batch / data settings
59 | "train_micro_batch_size_per_gpu": 4,
60 | "data_impl": "mmap",
61 |
62 | # activation checkpointing
63 | "checkpoint_activations": true,
64 | "checkpoint_num_layers": 1,
65 | "partition_activations": true,
66 | "synchronize_each_layer": true,
67 |
68 | # regularization
69 | "gradient_clipping": 1.0,
70 | "weight_decay": 0.1,
71 | "hidden_dropout": 0.0,
72 | "attention_dropout": 0.0,
73 |
74 | # precision settings
75 | "fp16": {
76 | "enabled": true,
77 | "loss_scale": 0,
78 | "loss_scale_window": 1000,
79 | "hysteresis": 2,
80 | "min_loss_scale": 1
81 | },
82 |
83 | # misc. training settings
84 | "train_iters": 320000,
85 | "lr_decay_iters": 320000,
86 | "distributed_backend": "nccl",
87 | "lr_decay_style": "cosine",
88 | "warmup": 0.01,
89 | "checkpoint_factor": 10000,
90 | "eval_interval": 1000,
91 | "eval_iters": 10,
92 |
93 | # logging
94 | "log_interval": 10,
95 | "steps_per_print": 10,
96 | "keep_last_n_checkpoints": 4,
97 | "wall_clock_breakdown": true,
98 |
99 | # networking
100 | "hostfile": "/mock_path"
101 | }
102 |
--------------------------------------------------------------------------------
/configs/125M-dmoe.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # See README for MoE config docs!
4 | "moe_type": "megablocks",
5 | "moe_token_dropping": false,
6 | # Use 4 experts per MoE layer (MoE layers occur every 2 transformer layers by default)
7 | "moe_num_experts": 4,
8 | # parallelism settings
9 | "enable_expert_tensor_parallelism": true,
10 | "pipe_parallel_size": 1, # not yet supported for MoE
11 | "model_parallel_size": 1,
12 | "moe_expert_parallel_size": 1,
13 |
14 | # model settings
15 | "num_layers": 12,
16 | "hidden_size": 768,
17 | "num_attention_heads": 12,
18 | "seq_length": 2048,
19 | "max_position_embeddings": 2048,
20 | "norm": "layernorm",
21 | "pos_emb": "rotary",
22 | "no_weight_tying": true,
23 | "gpt_j_residual": false,
24 | "output_layer_parallelism": "column",
25 |
26 | # these should provide some speedup but take a while to build, set to true if desired
27 | "scaled_upper_triang_masked_softmax_fusion": false,
28 | "bias_gelu_fusion": false,
29 | "rope_fusion": false,
30 |
31 | # init methods
32 | "init_method": "small_init",
33 | "output_layer_init_method": "wang_init",
34 |
35 |
36 | # optimizer settings
37 | "optimizer": {
38 | "type": "Adam",
39 | "params": {
40 | "lr": 0.0006,
41 | "betas": [0.9, 0.95],
42 | "eps": 1.0e-8,
43 | }
44 | },
45 | "min_lr": 0.00006,
46 |
47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
48 | "zero_optimization": {
49 | "stage": 0,
50 | "allgather_partitions": True,
51 | "allgather_bucket_size": 500000000,
52 | "overlap_comm": True,
53 | "reduce_scatter": True,
54 | "reduce_bucket_size": 500000000,
55 | "contiguous_gradients": True,
56 | },
57 |
58 | # batch / data settings
59 | "train_micro_batch_size_per_gpu": 4,
60 | "data_impl": "mmap",
61 |
62 | # activation checkpointing
63 | "checkpoint_activations": true,
64 | "checkpoint_num_layers": 1,
65 | "partition_activations": true,
66 | "synchronize_each_layer": true,
67 |
68 | # regularization
69 | "gradient_clipping": 1.0,
70 | "weight_decay": 0.1,
71 | "hidden_dropout": 0.0,
72 | "attention_dropout": 0.0,
73 |
74 | # precision settings
75 | "fp16": {
76 | "enabled": true,
77 | "loss_scale": 0,
78 | "loss_scale_window": 1000,
79 | "hysteresis": 2,
80 | "min_loss_scale": 1
81 | },
82 |
83 | # misc. training settings
84 | "train_iters": 320000,
85 | "lr_decay_iters": 320000,
86 | "distributed_backend": "nccl",
87 | "lr_decay_style": "cosine",
88 | "warmup": 0.01,
89 | "checkpoint_factor": 10000,
90 | "eval_interval": 1000,
91 | "eval_iters": 10,
92 |
93 | # logging
94 | "log_interval": 10,
95 | "steps_per_print": 10,
96 | "keep_last_n_checkpoints": 4,
97 | "wall_clock_breakdown": true,
98 |
99 | # networking
100 | "hostfile": "/mock_path"
101 | }
102 |
--------------------------------------------------------------------------------
/configs/125M/512/125M.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "rotary",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | # init methods
25 | "init_method": "small_init",
26 | "output_layer_init_method": "wang_init",
27 |
28 |
29 | # optimizer settings
30 | "optimizer": {
31 | "type": "Adam",
32 | "params": {
33 | "lr": 0.0006,
34 | "betas": [0.9, 0.95],
35 | "eps": 1.0e-8,
36 | }
37 | },
38 | "min_lr": 0.00006,
39 |
40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41 | "zero_optimization": {
42 | "stage": 1,
43 | "allgather_partitions": True,
44 | "allgather_bucket_size": 500000000,
45 | "overlap_comm": True,
46 | "reduce_scatter": True,
47 | "reduce_bucket_size": 500000000,
48 | "contiguous_gradients": True,
49 | },
50 |
51 | # batch / data settings
52 | "train_micro_batch_size_per_gpu": 32,
53 | "data-impl": "mmap",
54 |
55 | "mlp_width": 32,
56 |
57 | # activation checkpointing
58 | "checkpoint-activations": true,
59 | "checkpoint-num-layers": 1,
60 | "partition-activations": true,
61 | "synchronize-each-layer": true,
62 |
63 | # regularization
64 | "gradient_clipping": 1.0,
65 | "weight-decay": 0.1,
66 | "hidden-dropout": 0.0,
67 | "attention-dropout": 0.0,
68 |
69 | # precision settings
70 | "fp16": {
71 | "enabled": true,
72 | "loss_scale": 0,
73 | "loss_scale_window": 1000,
74 | "hysteresis": 2,
75 | "min_loss_scale": 1
76 | },
77 |
78 | # misc. training settings
79 | "train-iters": 50000,
80 | "lr-decay-iters": 50000,
81 | "distributed-backend": "nccl",
82 | "lr-decay-style": "cosine",
83 | "warmup": 0.01,
84 | "checkpoint-factor": 10000,
85 | "eval-interval": 5000,
86 | "eval-iters": 20,
87 |
88 | # logging
89 | "log-interval": 100,
90 | "steps_per_print": 10,
91 | "keep-last-n-checkpoints": 4,
92 | "wall_clock_breakdown": true,
93 |
94 | # networking
95 | "hostfile": "/mock_path",
96 | "save": "checkpoints/125M/none_c_ffn",
97 | "load": "checkpoints/125M/none_c_ffn",
98 | "tensorboard-dir": "tensorboard/125M/none_c_ffn",
99 | "log-dir": "logs/125M/none_c_ffn",
100 | }
101 |
--------------------------------------------------------------------------------
/configs/125M/512/125M_fire_capev2.yml:
--------------------------------------------------------------------------------
1 | # GPT-2 pretraining setup
2 | {
3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4 | # across the node boundaries )
5 | "pipe-parallel-size": 1,
6 | "model-parallel-size": 1,
7 |
8 | # model settings
9 | "num-layers": 12,
10 | "hidden-size": 768,
11 | "num-attention-heads": 12,
12 | "seq-length": 512,
13 | "max-position-embeddings": 2048,
14 | "norm": "layernorm",
15 | "pos-emb": "capev2",
16 | "no-weight-tying": true,
17 | "gpt_j_residual": false,
18 | "output_layer_parallelism": "column",
19 |
20 | # these should provide some speedup but take a while to build, set to true if desired
21 | "scaled-upper-triang-masked-softmax-fusion": false,
22 | "bias-gelu-fusion": false,
23 |
24 | "mlp_width": 32,
25 | "capev2_kernel": 3,
26 | "noise_seq_length": 512,
27 | # init methods
28 | "init_method": "small_init",
29 | "output_layer_init_method": "wang_init",
30 |
31 |
32 | # optimizer settings
33 | "optimizer": {
34 | "type": "Adam",
35 | "params": {
36 | "lr": 0.0006,
37 | "betas": [0.9, 0.95],
38 | "eps": 1.0e-8,
39 | }
40 | },
41 | "min_lr": 0.00006,
42 |
43 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
44 | "zero_optimization": {
45 | "stage": 1,
46 | "allgather_partitions": True,
47 | "allgather_bucket_size": 500000000,
48 | "overlap_comm": True,
49 | "reduce_scatter": True,
50 | "reduce_bucket_size": 500000000,
51 | "contiguous_gradients": True,
52 | },
53 |
54 | # batch / data settings
55 | "train_micro_batch_size_per_gpu": 32,
56 | "data-impl": "mmap",
57 |
58 | # activation checkpointing
59 | "checkpoint-activations": true,
60 | "checkpoint-num-layers": 1,
61 | "partition-activations": true,
62 | "synchronize-each-layer": true,
63 |
64 | # regularization
65 | "gradient_clipping": 1.0,
66 | "weight-decay": 0.1,
67 | "hidden-dropout": 0.0,
68 | "attention-dropout": 0.0,
69 |
70 | # precision settings
71 | "fp16": {
72 | "enabled": true,
73 | "loss_scale": 0,
74 | "loss_scale_window": 1000,
75 | "hysteresis": 2,
76 | "min_loss_scale": 1
77 | },
78 |
79 | # misc. training settings
80 | "train-iters": 50000,
81 | "lr-decay-iters": 50000,
82 | "distributed-backend": "nccl",
83 | "lr-decay-style": "cosine",
84 | "warmup": 0.01,
85 | "checkpoint-factor": 10000,
86 | "eval-interval": 5000,
87 | "eval-iters": 20,
88 |
89 | # logging
90 | "log-interval": 100,
91 | "steps_per_print": 10,
92 | "keep-last-n-checkpoints": 4,
93 | "wall_clock_breakdown": true,
94 |
95 | # networking
96 | "hostfile": "/mock_path",
97 | "save": "checkpoints",
98 | "load": "checkpoints",
99 | "tensorboard-dir": "tensorboard",
100 | "log-dir": "logs",
101 | }
102 |
--------------------------------------------------------------------------------