├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── test_dependencies.py │ ├── test_tokenizer.py │ ├── test_url_accessibility.py │ ├── test_format_conversion_scripts.py │ └── test_arguments.py ├── data │ └── sample_prompt.txt ├── neox_args │ ├── __init__.py │ ├── test_neoxargs_implementation.py │ └── test_neoxargs_usage.py ├── cpu_tests │ └── docker-compose.yml ├── model │ └── __init__.py ├── pytest.ini └── config │ └── test_setup.yml ├── tools ├── __init__.py ├── bash │ ├── kill.sh │ ├── killall.sh │ ├── README.md │ ├── sync_cmd.sh │ ├── sync.sh │ └── syncdir.sh ├── README.md ├── datasets │ ├── dataset_token_count.py │ ├── multinode_prepare_data.sh │ └── merge_datasets.py └── ckpts │ └── upload.py ├── megatron ├── model │ ├── rwkv │ │ ├── __init__.py │ │ └── v6 │ │ │ ├── __init__.py │ │ │ └── cuda │ │ │ └── wkv6_op.cpp │ ├── mamba │ │ └── __init__.py │ ├── megablocks_utils.py │ ├── __init__.py │ └── fused_bias_dropout.py ├── data │ ├── __init__.py │ ├── Makefile │ └── blendable_dataset.py ├── gradient_noise_scale │ └── __init__.py ├── tokenizer │ └── __init__.py ├── fused_kernels │ ├── compat.h │ └── scaled_upper_triang_masked_softmax.cpp ├── __init__.py ├── devutil.py ├── mpu │ ├── random.py │ └── __init__.py └── neox_arguments │ └── template.py ├── .dockerignore ├── requirements ├── requirements-apex-pip.txt ├── requirements-comet.txt ├── requirements-wandb.txt ├── requirements-sparseattention.txt ├── requirements-flashattention.txt ├── requirements-onebitadam.txt ├── requirements-s3.txt ├── requirements-tensorboard.txt ├── requirements-transformerengine.txt ├── requirements-mamba.txt ├── requirements-dev.txt └── requirements.txt ├── MANIFEST.in ├── images ├── memory_profiling.png ├── nsight_profiling.png └── pytorch_profiling.png ├── .idea ├── misc.xml ├── vcs.xml ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── DAPE.iml ├── configs ├── cpu_mock_config.yml ├── slurm_local.json ├── slurm_local.yml ├── prof.yml ├── docker │ └── pythia-paths.yml ├── text_generation.yml ├── sparse.yml ├── llama │ ├── 13B.yml │ ├── 30B.yml │ ├── 65B.yml │ ├── 7B.yml │ ├── README.md │ └── train_config.yml ├── llama2 │ ├── 13B.yml │ ├── 7B.yml │ ├── 70B.yml │ ├── codellama_7B.yml │ └── codellama_34B.yml ├── local_setup.yml ├── eleutherai_cluster.yml ├── local_setup_wandb.yml ├── mistral │ └── 7B.yml ├── local_setup_comet.yml ├── slurm_125M.yml ├── 125M-json.yml ├── autotuning_configs │ ├── tune_6-7B.json │ ├── tune.json │ ├── small_tune.json │ └── tune_1-3B.json ├── gmlp_small.yml ├── pythia │ ├── 70M.yml │ ├── 160M.yml │ ├── 1-4B.yml │ ├── 410M.yml │ ├── 6-9B.yml │ ├── 12B.yml │ ├── 1B.yml │ ├── 2-8B.yml │ ├── 31M.yml │ └── 14M.yml ├── 800M.yml ├── finetuning_configs │ └── 6-9B.yml ├── bf16_125M.yml ├── mamba │ ├── mamba-130M.yml │ ├── mamba-370M.yml │ ├── mamba-1.4B.yml │ ├── mamba-2.8B.yml │ └── mamba-790M.yml ├── 49M.yml ├── bnb_125M.yml ├── 19M.yml ├── 175B.yml ├── 350M.yml ├── 1-3B.yml ├── 2-7B.yml ├── 6-7B.yml ├── 13B.yml ├── 760M.yml ├── 125M.yml ├── 125M │ └── 512 │ │ ├── 125M_cope.yml │ │ ├── 125M_fire.yml │ │ ├── 125M_alibi.yml │ │ ├── 125M_alibi_c.yml │ │ ├── 125M_kerple.yml │ │ ├── 125M_fire_c.yml │ │ ├── 125M.yml │ │ └── 125M_fire_capev2.yml ├── rwkv │ └── 170M.yml ├── 125M-moe.yml └── 125M-dmoe.yml ├── docker-compose-dockerhub.yml ├── eval_tasks └── __init__.py ├── docker-compose.yml ├── deepy.py ├── train.py ├── .pre-commit-config.yaml ├── README-MUP.md ├── post-training ├── llama_data.py └── recreating_zephyr_dpo.md ├── prepare_data.py └── 
.gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /megatron/model/rwkv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 20B_checkpoints/ 2 | -------------------------------------------------------------------------------- /tools/bash/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /requirements/requirements-apex-pip.txt: -------------------------------------------------------------------------------- 1 | pip==23.3.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-comet.txt: -------------------------------------------------------------------------------- 1 | comet_ml>=3.45.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-wandb.txt: -------------------------------------------------------------------------------- 1 | wandb>=0.10.28 2 | -------------------------------------------------------------------------------- /requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==2.1.0 2 | -------------------------------------------------------------------------------- /tests/data/sample_prompt.txt: -------------------------------------------------------------------------------- 1 | Hello, I'm a language model 2 | -------------------------------------------------------------------------------- /requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==2.5.6 2 | -------------------------------------------------------------------------------- /requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111>=8.6.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-s3.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | hf-transfer>=0.1.3 3 | -------------------------------------------------------------------------------- /requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.13.0 2 | -------------------------------------------------------------------------------- /tools/bash/killall.sh: 
-------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /megatron/model/rwkv/v6/__init__.py: -------------------------------------------------------------------------------- 1 | from .rwkv import RWKVResidualLayerPipe, RWKVResidualLayer 2 | -------------------------------------------------------------------------------- /images/memory_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/memory_profiling.png -------------------------------------------------------------------------------- /images/nsight_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/nsight_profiling.png -------------------------------------------------------------------------------- /images/pytorch_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuanyang-Zheng/DAPE/HEAD/images/pytorch_profiling.png -------------------------------------------------------------------------------- /requirements/requirements-transformerengine.txt: -------------------------------------------------------------------------------- 1 | pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable 2 | -------------------------------------------------------------------------------- /tests/neox_args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /megatron/model/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | from .mamba import ( 2 | ParallelMambaResidualLayer, 3 | ParallelMambaResidualLayerPipe, 4 | ) 5 | -------------------------------------------------------------------------------- /requirements/requirements-mamba.txt: -------------------------------------------------------------------------------- 1 | causal_conv1d>=1.1.0 2 | einops 3 | mamba_ssm>=1.2.0.post1 # required for untied embedding + unembedding layers 4 | -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8>=1.5.6 2 | clang-format>=13.0.1 3 | pre-commit>=2.17.0 4 | pytest>=6.2.3 5 | pytest-cov>=2.11.1 6 | pytest-forked>=1.3.0 7 | pytest-html==4.1.1 8 | pytest-xdist 9 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /configs/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /tests/unit/test_dependencies.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from megatron import fused_kernels 3 | 4 | 5 | def test_fused_kernels(): 6 | pytest.xfail(reason="Fused kernels require manual intervention to install") 7 | fused_kernels.load_fused_kernels() 8 | -------------------------------------------------------------------------------- /configs/slurm_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab-file": "data/gpt2-vocab.json", 3 | "merge-file": "data/gpt2-merges.txt", 4 | "save": "checkpoints", 5 | "checkpoint_validation_with_forward_pass": false, 6 | "tensorboard-dir": "tensorboard", 7 | "log-dir": "logs" 8 | } 9 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /configs/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data_path": "data/enwik8/enwik8_text_document", 3 | "vocab_file": "data/gpt2-vocab.json", 4 | "merge_file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard_dir": "tensorboard", 8 | "log_dir": "logs", 9 | } 10 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /tests/unit/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from megatron.tokenizer import train_tokenizer 3 | 4 | 5 | 
@pytest.mark.cpu 6 | def test_train_tokenizer(): 7 | input_args = [ 8 | "--json_input_dir", 9 | "./tests/data/enwik8_first100.txt", 10 | "--tokenizer_output_path", 11 | "", 12 | ] 13 | args = train_tokenizer.parse_args(input_args) 14 | train_tokenizer.main(args) 15 | -------------------------------------------------------------------------------- /configs/prof.yml: -------------------------------------------------------------------------------- 1 | # Sample profiling config 2 | { 3 | # Turns on nsys and pytorch profiling 4 | "profile": true, 5 | 6 | # pytorch profiler options 7 | "profile_step_start": 10, 8 | "profile_step_stop": 12, 9 | 10 | # pytorch memory profiler options 11 | "memory_profiling": true, 12 | "memory_profiling_path": tensorboard, 13 | 14 | 15 | # All trace files (pytorch, nsys, tensorboard, etc) will be written here 16 | "tensorboard_dir": "tensorboard", 17 | } 18 | -------------------------------------------------------------------------------- /.idea/DAPE.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed 2 | ftfy>=6.0.1 3 | huggingface_hub>=0.11.0 4 | jinja2==3.1.4 5 | lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 6 | lm_eval>=0.4.0,<=0.4.1 7 | mpi4py>=3.0.3 8 | numpy<2.0 9 | pybind11>=2.6.2 10 | regex 11 | sentencepiece 12 | six 13 | tiktoken>=0.1.2 14 | tokenizers>=0.12.1 15 | transformers==4.38.0 16 | -------------------------------------------------------------------------------- /tools/bash/README.md: -------------------------------------------------------------------------------- 1 | # Bash Scripts 2 | Useful for running distributed per-node scripts on e.g. 
Kubernetes 3 | 4 | * `kill.sh` kills all python processes 5 | * `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/` 6 | * `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/` 7 | * `sync.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` 8 | * `syncdir.sh` uses pdcp to recursively copy a provided directory to all of the nodes listed in `/job/hosts/` 9 | -------------------------------------------------------------------------------- /configs/docker/pythia-paths.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 3 | "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 4 | "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], 5 | 6 | "tokenizer-type": "HFTokenizer", 7 | "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json", 8 | 9 | "save": "/home/mchorse/chk/", 10 | "load": "/home/mchorse/chk/", 11 | "checkpoint_validation_with_forward_pass": False 12 | } 13 | -------------------------------------------------------------------------------- /configs/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text_gen_type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num_samples": 10, 17 | 18 | # input/output file 19 | "sample_input_file": "sample_input.txt", 20 | "sample_output_file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /configs/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/cpu_tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' # slightly different to make sure CPU tests run without nvidia device 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: gpt-neox 6 | build: 7 | context: .
8 | dockerfile: Dockerfile 9 | shm_size: 1g 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | logging: 15 | options: 16 | max-size: "100m" 17 | max-file: "3" 18 | volumes: 19 | - ${NEOX_DATA_PATH}:/home/mchorse/data 20 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 21 | - .:/home/mchorse/gpt-neox 22 | -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /docker-compose-dockerhub.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: leogao2/gpt-neox:main 6 | shm_size: 1g 7 | ulimits: 8 | memlock: 9 | soft: -1 10 | hard: -1 11 | runtime: nvidia 12 | deploy: 13 | resources: 14 | reservations: 15 | devices: 16 | - driver: nvidia 17 | capabilities: [gpu] 18 | logging: 19 | options: 20 | max-size: "100m" 21 | max-file: "3" 22 | volumes: 23 | - ${NEOX_DATA_PATH}:/home/mchorse/data 24 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 25 | - .:/home/mchorse/gpt-neox 26 | -------------------------------------------------------------------------------- /eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | gpt-neox: 4 | command: nvidia-smi dmon 5 | image: gpt-neox 6 | build: 7 | context: . 
8 | dockerfile: Dockerfile 9 | shm_size: 1g 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | runtime: nvidia 15 | deploy: 16 | resources: 17 | reservations: 18 | devices: 19 | - driver: nvidia 20 | capabilities: [gpu] 21 | logging: 22 | options: 23 | max-size: "100m" 24 | max-file: "3" 25 | volumes: 26 | - ${NEOX_DATA_PATH}:/home/mchorse/data 27 | - ${NEOX_CHECKPOINT_PATH}:/home/mchorse/chk 28 | - .:/home/mchorse/gpt-neox 29 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /configs/llama/13B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/30B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 60, 8 | "hidden_size": 6656, 9 | "num_attention_heads": 52, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/65B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | 
"seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama2/13B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 4096, 11 | "max_position_embeddings": 4096, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-5, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama2/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 4096, 11 | "max_position_embeddings": 4096, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-5, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "activation": "swiglu", 25 | "mlp_multiple_of": 256, 26 | } 27 | -------------------------------------------------------------------------------- /configs/llama/README.md: -------------------------------------------------------------------------------- 1 | # LLaMA 2 | 3 | ## Training and Finetuning 4 | 5 | These configs contain the architecture settings required to run inference/training/finetuning on the 
[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite. 6 | 7 | LLaMA finetuning can be launched with 8 | ```sh 9 | python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml 10 | ``` 11 | 12 | If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`. 13 | 14 | 15 | ## Inference 16 | 17 | 18 | LLaMA generation can be launched with 19 | ```sh 20 | python ./deepy.py ./generate.py -d configs \ 21 | llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \ 22 | -i input_prompt.txt -o prompt_out.txt 23 | ``` 24 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX Auxiliary Tools 2 | 3 | This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code. 4 | 5 | ## Bash 6 | 7 | This directory contains some simple, frequently used bash commands to make working on multiple machines easier. 8 | 9 | ## Checkpoints 10 | 11 | This directory contains tools for manipulating and converting checkpoints including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x. 12 | 13 | ## Datasets 14 | 15 | This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library. 16 | -------------------------------------------------------------------------------- /tests/unit/test_url_accessibility.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests 3 | 4 | from tools.datasets.corpora import DATA_DOWNLOADERS 5 | 6 | 7 | def check_url_accessible(url): 8 | try: 9 | response = requests.head(url, timeout=5) 10 | response.raise_for_status() 11 | return True 12 | except requests.exceptions.RequestException as e: 13 | print(f"Error: Unable to access URL - {e}") 14 | return False 15 | 16 | 17 | @pytest.mark.cpu 18 | @pytest.mark.parametrize("dataset_name", list(DATA_DOWNLOADERS.keys())) 19 | def test_url_accessibility(dataset_name): 20 | if dataset_name == "pass": 21 | return 22 | elif not dataset_name == "enwik8": 23 | pytest.xfail() 24 | for url in DATA_DOWNLOADERS[dataset_name].urls: 25 | assert check_url_accessible(url) 26 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | filterwarnings = 19 | ignore::DeprecationWarning:pkg_resources.* 20 | ignore::DeprecationWarning:torch.* 21 | -------------------------------------------------------------------------------- /tools/bash/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /configs/llama2/70B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "intermediate_size": 28672, 10 | "num_attention_heads": 64, 11 | "num_kv_heads": 8, 12 | "seq_length": 4096, 13 | "max_position_embeddings": 4096, 14 | "pos_emb": "rotary", 15 | "rotary_pct": 1, 16 | "rotary_emb_base": 1000000, 17 | "no_weight_tying": true, 18 | "gpt_j_residual": false, 19 | "output_layer_parallelism": "column", 20 | "norm": "rmsnorm", 21 | "rms_norm_epsilon": 1.0e-5, 22 | 23 | "attention_config": [[["flash"], 80]], 24 | 25 | "scaled_upper_triang_masked_softmax_fusion": true, 26 | "bias_gelu_fusion": false, 27 | "use_bias_in_norms": false, 28 | "use_bias_in_attn_linear": false, 29 | "activation": "swiglu", 30 | "mlp_multiple_of": 256, 31 | } 32 | -------------------------------------------------------------------------------- /configs/llama2/codellama_7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | # Codellama was uptrained on 16k token sequence lengths 11 | # with rotary_emb_base adjusted to 1_000_000. 
12 | "seq_length": 16384, 13 | "max_position_embeddings": 16384, 14 | "pos_emb": "rotary", 15 | "rotary_pct": 1, 16 | "rotary_emb_base": 1000000, 17 | "no_weight_tying": true, 18 | "gpt_j_residual": false, 19 | "output_layer_parallelism": "column", 20 | "norm": "rmsnorm", 21 | "rms_norm_epsilon": 1.0e-5, 22 | 23 | "attention_config": [[["flash"], 32]], 24 | 25 | "scaled_upper_triang_masked_softmax_fusion": true, 26 | "bias_gelu_fusion": false, 27 | "use_bias_in_norms": false, 28 | "use_bias_in_attn_linear": false, 29 | "activation": "swiglu", 30 | "mlp_multiple_of": 256, 31 | } 32 | -------------------------------------------------------------------------------- /tools/bash/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /configs/llama2/codellama_34B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 48, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | "num_kv_heads": 8, 11 | # Codellama was uptrained on 16k token sequence lengths 12 | # with rotary_emb_base adjusted to 1_000_000. 13 | "seq_length": 16384, 14 | "max_position_embeddings": 16384, 15 | "pos_emb": "rotary", 16 | "rotary_pct": 1, 17 | "rotary_emb_base": 1000000, 18 | "no_weight_tying": true, 19 | "gpt_j_residual": false, 20 | "output_layer_parallelism": "column", 21 | "norm": "rmsnorm", 22 | "rms_norm_epsilon": 1.0e-5, 23 | 24 | "attention_config": [[["flash"], 48]], 25 | 26 | "scaled_upper_triang_masked_softmax_fusion": true, 27 | "bias_gelu_fusion": false, 28 | "use_bias_in_norms": false, 29 | "use_bias_in_attn_linear": false, 30 | "activation": "swiglu", 31 | "mlp_multiple_of": 256, 32 | } 33 | -------------------------------------------------------------------------------- /megatron/model/megablocks_utils.py: -------------------------------------------------------------------------------- 1 | """Adapter to expose MegaBlocks package, if available.""" 2 | 3 | try: 4 | import megablocks 5 | except ImportError: 6 | megablocks = None 7 | 8 | 9 | def megablocks_is_available(): 10 | return megablocks is not None 11 | 12 | 13 | def assert_megablocks_is_available(): 14 | assert ( 15 | megablocks_is_available() 16 | ), "MegaBlocks not available. Please run `pip install megablocks`." 
17 | 18 | 19 | moe = megablocks.layers.moe if megablocks_is_available() else None 20 | dmoe = megablocks.layers.dmoe if megablocks_is_available() else None 21 | arguments = megablocks.layers.arguments if megablocks_is_available() else None 22 | 23 | 24 | def as_megablocks_args(neox_args): 25 | import copy 26 | 27 | tmp = copy.copy(neox_args) 28 | args = arguments.from_megatron(tmp) 29 | args.moe_lbl_in_fp32 = True 30 | args.fp16 = neox_args.precision == "fp16" 31 | args.moe_loss_weight = neox_args.moe_loss_coeff 32 | return args 33 | -------------------------------------------------------------------------------- /tools/datasets/dataset_token_count.py: -------------------------------------------------------------------------------- 1 | # Script counts tokens in a pretokenized dataset from preprocess_data.py 2 | # Necessary for setting batch size, train_iters, etc 3 | 4 | import sys 5 | import os 6 | 7 | ## Necessary for the import 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 9 | sys.path.insert(0, project_root) 10 | 11 | from megatron.data import indexed_dataset 12 | import numpy as np 13 | 14 | if len(sys.argv) < 2: 15 | print( 16 | "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..." 17 | ) 18 | sys.exit(1) 19 | 20 | # Access the command-line arguments 21 | arguments = sys.argv[1:] 22 | 23 | for arg in arguments: 24 | print("Checking file", arg) 25 | try: 26 | dataset = indexed_dataset.make_dataset(arg, "mmap") 27 | size = np.sum(dataset.sizes) 28 | print("Dataset size in tokens is", size) 29 | except AttributeError: 30 | print("Dataset could not be loaded", arg) 31 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /tools/bash/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .neox_arguments import NeoXArgs 27 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import ( 20 | get_params_for_weight_decay_optimization, 21 | mark_norms_for_sequence_parallel_grad_sync, 22 | ) 23 | from .word_embeddings import SoftEmbedding 24 | -------------------------------------------------------------------------------- /tests/unit/test_format_conversion_scripts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from tools.ckpts import convert_neox_to_hf 3 | from tests.common import simulate_deepy_env, save_random_model 4 | from megatron.neox_arguments.neox_args import NeoXArgsTokenizer 5 | 6 | 7 | def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): 8 | # Generate random GPT-NEOX model, check we can convert to hf format 9 | model_dir = str(tmpdir) 10 | input_args = ["train.py", "tests/config/test_setup.yml"] 11 | deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args) 12 | save_random_model(deepspeed_main_args, model_dir, train_iters=1) 13 | 14 | # Generate output 15 | script_args = [ 16 | "--config_file", 17 | "tests/config/test_setup.yml", 18 | "--input_dir", 19 | model_dir + "/global_step1", 20 | "--output_dir", 21 | model_dir, 22 | ] 23 | overwrite_values = {"tokenizer_type": NeoXArgsTokenizer.tokenizer_type} 24 | convert_neox_to_hf.main(input_args=script_args, overwrite_values=overwrite_values) 25 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /configs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "data/gpt2-vocab.json", 19 | "merge_file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | } 28 | -------------------------------------------------------------------------------- /configs/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], 5 | "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], 6 | "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split" 995,4,1 18 | # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", 19 | 20 | "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard_dir": "/mnt/ssd-1/tensorboard", 25 | "log_dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /configs/local_setup_wandb.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": 
["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "data/gpt2-vocab.json", 19 | "merge_file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /megatron/devutil.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | 3 | 4 | class Metric: 5 | """ 6 | Dumb utility to collect and report average wall-time metrics. 7 | """ 8 | 9 | def __init__(self, label): 10 | self.label = label 11 | self.measurements = [] 12 | 13 | def collect(self, measurement): 14 | self.measurements.append(measurement) 15 | 16 | def get_measurements(self): 17 | return self.measurements[:] 18 | 19 | def report(self): 20 | print( 21 | self.label, 22 | torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0), 23 | ) 24 | 25 | 26 | def monitor_method_cuda_wall_times(metric, obj, methodname): 27 | """ 28 | Measure timings for a method on an object or class. 
29 | 30 | For instance: 31 | 32 | >>> metric = Metric('!LNORM') 33 | >>> monitor_method_wall_times(metric, LayerNorm, 'forward') 34 | """ 35 | oldmeth = getattr(obj, methodname) 36 | 37 | start_event = torch.cuda.Event(enable_timing=True) 38 | end_event = torch.cuda.Event(enable_timing=True) 39 | 40 | def newmeth(*args, **kw): 41 | start_event.record() 42 | try: 43 | return oldmeth(*args, **kw) 44 | finally: 45 | end_event.record() 46 | torch.cuda.synchronize() 47 | elapsed = start_event.elapsed_time(end_event) 48 | metric.collect(elapsed) 49 | metric.report() 50 | 51 | setattr(obj, methodname, newmeth) 52 | -------------------------------------------------------------------------------- /configs/mistral/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "intermediate_size": 14336, 10 | "num_attention_heads": 32, 11 | "num_kv_heads": 8, 12 | # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen 13 | # and instruction tuned to 16384 seqlen, all with 4096 sliding window 14 | "seq_length": 8192, 15 | "sliding_window_width": 4096, 16 | "max_position_embeddings": 131072, 17 | "pos_emb": "rotary", 18 | "rotary_pct": 1, 19 | "rotary_emb_base": 10000, 20 | "no_weight_tying": true, 21 | "gpt_j_residual": false, 22 | "output_layer_parallelism": "column", 23 | "norm": "rmsnorm", 24 | "rms_norm_epsilon": 1.0e-5, 25 | 26 | # Grouped Query Attention is supported for both default ("global") 27 | # and Flash attention. However, we highly recommend the use of Flash attention 28 | # to get FLOP + runtime speedups when using GQA, 29 | # and sliding window attention is currently only supported by Flash attention. 30 | "attention_config": [[["flash"], 32]], 31 | 32 | "scaled_upper_triang_masked_softmax_fusion": true, 33 | "bias_gelu_fusion": false, 34 | "use_bias_in_norms": false, 35 | "use_bias_in_attn_linear": false, 36 | "activation": "swiglu", 37 | 38 | "tokenizer_type": "SPMTokenizer", 39 | #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download 40 | 41 | } 42 | -------------------------------------------------------------------------------- /deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2024, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed.launcher.runner 20 | 21 | 22 | def main(input_args=None): 23 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 24 | 25 | from megatron.neox_arguments import NeoXArgs 26 | from megatron.utils import get_wandb_api_key 27 | 28 | neox_args = NeoXArgs.consume_deepy_args(input_args) 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | deepspeed.launcher.runner.main(deepspeed_main_args) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /configs/local_setup_comet.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 7 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json", 19 | "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard_dir": "tensorboard", 26 | "log_dir": "logs", 27 | "use_comet": True, 28 | # "comet_workspace": "test_workspace", # CHANGE ME 29 | "comet_project": "test_project", 30 | "comet_experiment_name": "test_experiment", 31 | "comet_tags": ["test_tag1", "test_tag2"], 32 | "comet_others": {"test_others"}, 33 | } 34 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | 23 | def main(input_args=None, overwrite_values=None): 24 | neox_args = NeoXArgs.consume_neox_args( 25 | input_args=input_args, overwrite_values=overwrite_values 26 | ) 27 | neox_args.configure_distributed_args() 28 | neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab 29 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 30 | neox_args.initialize_comet() # is initialized if comet directory is defined 31 | pretrain(neox_args=neox_args) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-json 7 | - id: check-symlinks 8 | - id: check-yaml 9 | - id: destroyed-symlinks 10 | - id: end-of-file-fixer 11 | exclude: ^(docs/CNAME/|configs/neox_arguments.md) 12 | - id: fix-byte-order-marker 13 | - id: fix-encoding-pragma 14 | args: [--remove] 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: requirements-txt-fixer 18 | - id: trailing-whitespace 19 | exclude: ^(docs/CNAME/|configs/neox_arguments.md) 20 | - repo: https://gitlab.com/daverona/pre-commit/cpp 21 | rev: 0.8.0 22 | hooks: 23 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 24 | args: [] 25 | 26 | - repo: https://github.com/psf/black 27 | rev: 22.3.0 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | - repo: https://github.com/codespell-project/codespell 32 | rev: v2.1.0 33 | hooks: 34 | - id: codespell 35 | args: [ 36 | '--ignore-words-list=reord,dout,te', # Word used in error messages that need rewording. te --> transformerengine 37 | --check-filenames, 38 | --check-hidden, 39 | ] 40 | exclude: tests/data/hf_cache/tokenizer/gpt2.json 41 | -------------------------------------------------------------------------------- /megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether apply model parallelsim to checkpointed hidden states. 
27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object. 30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /tools/ckpts/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except Exception: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except Exception: 40 | print(f"branch {branch_name} already exists, continuing...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file}!") 53 | -------------------------------------------------------------------------------- /README-MUP.md: -------------------------------------------------------------------------------- 1 | # How to use Mup (https://github.com/microsoft/mup) 2 | 3 | ## Add mup neox args to your config 4 | 5 | ``` 6 | # mup 7 | 8 | "use-mup": true, 9 | 10 | "save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank 11 | 12 | "base-shapes-file": "base-shapes", # load base shapes from this file 13 | 14 | "coord-check": false, # generate coord check plots to verify mup's implementation in neox 15 | 16 | # mup hp search 17 | 18 | "mup-init-scale": 1.0, 19 | 20 | "mup-attn-temp": 1.0, 21 | 22 | "mup-output-temp": 1.0, 23 | 24 | "mup-embedding-mult": 1.0, 25 | 26 | "mup-rp-embedding-mult": 1.0, 27 | ``` 28 | 29 | ## Generate base shapes 30 | 31 | 1. Set use-mup to true 32 | 2. Set save-base-shapes to true 33 | 3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank named .. gpt-neox will exit immediately. 34 | 4. 
Set save-base-shapes to false 35 | 36 | ## Generate coord check plots (optional) 37 | 38 | 1. Keep use-mup true 39 | 2. Set coord-check to true 40 | 3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately 41 | 4. Set coord-check to false 42 | 43 | ## Tune mup hyperparameters and LR 44 | 45 | The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These and LR are tuned with a random search using the scaled-up config (tested with 6-7B.yml) but with hidden-size set to the value from the scaled-down config (125M.yml). 46 | 47 | ## Transfer 48 | 49 | With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again. 50 | -------------------------------------------------------------------------------- /tests/unit/test_arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from megatron.neox_arguments import NeoXArgs 16 | from tests.common import BASE_CONFIG, DistributedTest 17 | 18 | 19 | def test_main_constructor(): 20 | input_args = ["train.py", "tests/config/test_setup.yml"] 21 | neox_args = NeoXArgs.consume_deepy_args(input_args) 22 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 23 | neox_args = NeoXArgs.consume_neox_args(input_args=deepspeed_main_args) 24 | neox_args.configure_distributed_args() 25 | 26 | 27 | class test_constructor_from_ymls_class(DistributedTest): 28 | world_size = 2 29 | 30 | def test(self): 31 | neox_args = NeoXArgs.from_ymls(["tests/config/test_setup.yml"]) 32 | neox_args.configure_distributed_args() 33 | 34 | 35 | def test_constructor_from_ymls(): 36 | t1 = test_constructor_from_ymls_class() 37 | t1.test() 38 | 39 | 40 | class test_constructor_from_dict_class(DistributedTest): 41 | world_size = 2 42 | 43 | def test(self): 44 | neox_args = NeoXArgs.from_dict(BASE_CONFIG) 45 | 46 | 47 | def test_constructor_from_dict(): 48 | t1 = test_constructor_from_dict_class() 49 | t1.test() 50 | -------------------------------------------------------------------------------- /post-training/llama_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, DatasetDict 4 | 5 | import jsonlines 6 | 7 | ############### 8 | # Load datasets 9 | ############### 10 | raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") 11 | # convert to just train and test, not necessary but it looks better 12 | raw_datasets = DatasetDict( 13 | { 14 | "train": raw_datasets["train_prefs"], 15 | "test": raw_datasets["test_prefs"], 16 | } 17 | ) 18 | os.makedirs(os.path.join("data", "pairwise"), exist_ok=True) 19 | for split in ["train", "test"]: 20 | with open( 21 | os.path.join("data", "pairwise", 
f"llama3_dpo_{split}_filtered.jsonl"), "w" 22 | ) as f: 23 | writer = jsonlines.Writer(f) 24 | for item in raw_datasets[split]: 25 | item["chosen"] = item["chosen"] 26 | item["rejected"] = item["rejected"] 27 | writer.write(item) 28 | os.makedirs(os.path.join("data", "sft"), exist_ok=True) 29 | for split in ["train", "test"]: 30 | with open( 31 | os.path.join("data", "sft", f"llama3_sft_{split}_filtered.jsonl"), "w" 32 | ) as f: 33 | writer = jsonlines.Writer(f) 34 | for item in raw_datasets[split]: 35 | item["messages"] = item["chosen"] 36 | writer.write(item) 37 | os.makedirs(os.path.join("data", "kto"), exist_ok=True) 38 | for split in ["train", "test"]: 39 | with open( 40 | os.path.join("data", "kto", f"llama3_kto_{split}_filtered.jsonl"), "w" 41 | ) as f: 42 | writer = jsonlines.Writer(f) 43 | for item in raw_datasets[split]: 44 | item["messages"] = item["chosen"] 45 | item["reward"] = 1 46 | writer.write(item) 47 | item["messages"] = item["rejected"] 48 | item["reward"] = -1 49 | writer.write(item) 50 | -------------------------------------------------------------------------------- /megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | updates a property value if the key already exists 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error. 
33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value() to be updated property " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exists 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /configs/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "num_layers": 12, 5 | "hidden_size": 768, 6 | "num_attention_heads": 12, 7 | "seq_length": 2048, 8 | "max_position_embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos_emb": "rotary", 11 | "no_weight_tying": true, 12 | "scaled_upper_triang_masked_softmax_fusion": true, 13 | "bias_gelu_fusion": true, 14 | "rope_fusion": false, 15 | "layernorm_fusion": false, 16 | "optimizer": { 17 | "type": "Adam", 18 | "params": { 19 | "lr": 0.0006, 20 | "betas": [0.9, 0.999], 21 | "eps": 1.0e-8 22 | } 23 | }, 24 | "zero_optimization": { 25 | "stage": 0, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 500000000, 28 | "overlap_comm": true, 29 | "reduce_scatter": true, 30 | "reduce_bucket_size": 500000000, 31 | "contiguous_gradients": true 32 | }, 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | "split": "949,50,1", 36 | "checkpoint_activations": true, 37 | "checkpoint_num_layers": 1, 38 | "partition_activations": true, 39 | "synchronize_each_layer": true, 40 | "gradient_clipping": 1.0, 41 | "weight_decay": 0.0, 42 | "hidden_dropout": 0.0, 43 | "attention_dropout": 0.0, 44 | "fp16": { 45 | "enabled": true, 46 | "loss_scale": 0, 47 | "loss_scale_window": 1000, 48 | "hysteresis": 2, 49 | "min_loss_scale": 1 50 | }, 51 | "train_iters": 320000, 52 | "lr_decay_iters": 320000, 53 | "distributed_backend": "nccl", 54 | "lr_decay_style": "cosine", 55 | "warmup": 0.01, 56 | "checkpoint_factor": 10000, 57 | "eval_interval": 1000, 58 | "eval_iters": 10, 59 | "log_interval": 100, 60 | "steps_per_print": 10, 61 | "keep_last_n_checkpoints": 4, 62 | "wall_clock_breakdown": true, 63 | "launcher": "slurm", 64 | "deepspeed_slurm": true, 65 | "comment": "neox" 66 | } 67 | -------------------------------------------------------------------------------- /configs/llama/train_config.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "finetune": true, 4 | 5 | # init methods 6 | "init_method": "small_init", 7 | "output_layer_init_method": "wang_init", 8 | 9 | # optimizer settings 10 | "optimizer": { 11 | "type": "Adam", 12 | "params": { 13 | "lr": 0.0002, 14 | "betas": [0.9, 0.95], 15 | "eps": 1.0e-8, 16 | } 17 | }, 18 | "min_lr": 0.00002, 19 | "override_lr_scheduler": true, 20 | 21 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 22 | "zero_optimization": { 23 | "stage": 1, 24 | "allgather_partitions": True, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": True, 27 | "reduce_scatter": True, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": True, 30 | }, 31 | 32 | # batch / data settings 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | 36 | # activation checkpointing 37 | 
"checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": true, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0, 46 | "attention_dropout": 0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "fp16": true, 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | 58 | # misc. training settings 59 | "train_iters": 320000, 60 | "lr_decay_iters": 320000, 61 | "distributed_backend": "nccl", 62 | "lr_decay_style": "cosine", 63 | "warmup": 0.01, 64 | "checkpoint_factor": 10000, 65 | "eval_interval": 1000, 66 | "eval_iters": 10, 67 | 68 | # logging 69 | "log_interval": 100, 70 | "steps_per_print": 10, 71 | "keep_last_n_checkpoints": 4, 72 | "wall_clock_breakdown": true, 73 | "mlp_multiple_of": 256, 74 | } 75 | -------------------------------------------------------------------------------- /configs/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0.0, 55 | "attention_dropout": 0.0, 56 | 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 1000, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1 63 | }, 64 | 65 | "train_iters": 320000, 66 | "lr_decay_iters": 320000, 67 | "distributed_backend": "nccl", 68 | "lr_decay_style": "cosine", 69 | "warmup": 0.01, 70 | "checkpoint_factor": 10000, 71 | "eval_interval": 1000, 72 | "eval_iters": 10, 73 | 74 | "log_interval": 100, 75 | "steps_per_print": 10, 76 | "keep_last_n_checkpoints": 4, 77 | "wall_clock_breakdown": true, 78 | 79 | "hostfile": "/mock_path" 80 | } 81 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune_6-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 8, 4 | 5 | "num-layers": 32, 6 | "hidden-size": 4096, 7 | "num-attention-heads": 32, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | 
"scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.00012, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "zero_optimization": { 29 | "stage": [0, 1, 2, 3] 30 | }, 31 | "data-impl": "mmap", 32 | "split": "949,50,1", 33 | 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | 39 | "gradient_clipping": 1.0, 40 | "weight-decay": 0, 41 | "hidden-dropout": 0, 42 | "attention-dropout": 0, 43 | 44 | "fp16": { 45 | "fp16": true, 46 | "enabled": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "hysteresis": 2, 50 | "min_loss_scale": 1 51 | }, 52 | 53 | "train-iters": 100, 54 | "lr-decay-iters": 320000, 55 | "distributed-backend": "nccl", 56 | "lr-decay-style": "cosine", 57 | "warmup": 0.01, 58 | "checkpoint-factor": 10000, 59 | "eval-interval": 1000, 60 | "eval-iters": 10, 61 | "log-interval": 100, 62 | "steps_per_print": 10, 63 | "keep-last-n-checkpoints": 4, 64 | "wall_clock_breakdown": true, 65 | "launcher": "slurm", 66 | "deepspeed_slurm": true, 67 | "no_ssh_check": true, 68 | "comment": "neox", 69 | "autotuning": { 70 | "enabled": true, 71 | "mp_size": 8, 72 | "arg_mappings": { 73 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 74 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /configs/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num_layers": 12, 12 | "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 15 | "seq_length": 2048, 16 | "max_position_embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos_emb": "none", 19 | "no_weight_tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e_8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data_impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": false, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0.0, 46 | "attention_dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. 
training settings 58 | "train_iters": 320000, 59 | "lr_decay_iters": 320000, 60 | "distributed_backend": "nccl", 61 | "lr_decay_style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint_factor": 10000, 64 | "eval_interval": 1000, 65 | "eval_iters": 10, 66 | 67 | # logging 68 | "log_interval": 100, 69 | "steps_per_print": 10, 70 | "keep_last_n_checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | 
"reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true, 30 | "cpu_offload": false 31 | }, 32 | "train_micro_batch_size_per_gpu": 1, 33 | "autotuning_config": { 34 | "enabled": true, 35 | "arg_mappings": { 36 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 37 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 38 | } 39 | }, 40 | "data-impl": "mmap", 41 | "split": "949,50,1", 42 | "checkpoint-activations": true, 43 | "checkpoint-num-layers": 1, 44 | "partition-activations": true, 45 | "synchronize-each-layer": true, 46 | "gradient_clipping": 1.0, 47 | "weight-decay": 0.0, 48 | "hidden-dropout": 0.0, 49 | "attention-dropout": 0.0, 50 | "fp16": { 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | "train-iters": 200, 58 | "lr-decay-iters": 320000, 59 | "distributed-backend": "nccl", 60 | "lr-decay-style": "cosine", 61 | "warmup": 0.01, 62 | "save-interval": 10000, 63 | "eval-interval": 1000, 64 | "eval-iters": 10, 65 | "log-interval": 100, 66 | "steps_per_print": 10, 67 | "keep-last-n-checkpoints": 4, 68 | "wall_clock_breakdown": true, 69 | "launcher": "slurm", 70 | "deepspeed_slurm": true, 71 | "comment": "neox" 72 | } 73 | -------------------------------------------------------------------------------- /configs/pythia/70M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 6, 6 | "hidden_size": 512, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 6]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.001, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.0001, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 100000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | 
-------------------------------------------------------------------------------- /configs/pythia/160M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 12]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/1-4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0002, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00002, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 16, 46 | "data_impl": "mmap", 47 | 
"num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/410M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0003, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00003, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 4096, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | 
"max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 32]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00012, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8 29 | } 30 | }, 31 | 32 | "min_lr": 0.000012, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 1260000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 1260000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/autotuning_configs/small_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.0006, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "data-impl": "mmap", 29 | "split": "949,50,1", 30 | 31 | "checkpoint-activations": true, 32 | "checkpoint-num-layers": 1, 33 | "partition-activations": true, 34 | "synchronize-each-layer": true, 35 | 36 | "gradient_clipping": 1.0, 37 | "weight-decay": 0.0, 38 | "hidden-dropout": 0.0, 39 | "attention-dropout": 0.0, 40 | 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "loss_scale_window": 1000, 45 | "hysteresis": 2, 46 | "min_loss_scale": 1 47 | }, 48 | 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "save-interval": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | 58 | "log-interval": 100, 59 | "steps_per_print": 10, 60 | "keep-last-n-checkpoints": 4, 61 | "wall_clock_breakdown": true, 62 | "launcher": "slurm", 63 | "deepspeed_slurm": true, 64 | "comment": "neox", 65 | 
"autotuning": { 66 | "enabled": true, 67 | "arg_mappings": { 68 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 69 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 70 | } 71 | }, 72 | "zero_optimization": { 73 | "stage": [0, 1, 2, 3] 74 | }, 75 | "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 76 | "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 77 | "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] 78 | } 79 | -------------------------------------------------------------------------------- /configs/pythia/12B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | 5 | "num_layers": 36, 6 | "hidden_size": 5120, 7 | "num_attention_heads": 40, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 36]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00012, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000012, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 1260000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 1260000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "train_micro_batch_size_per_gpu": 8, 44 | "gradient_accumulation_steps": 2, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0, 55 | "attention_dropout": 0, 56 | 57 | "fp16": { 58 | "fp16": true, 59 | "enabled": true, 60 | "loss_scale": 0, 61 | "loss_scale_window": 1000, 62 | "initial_scale_power": 12, 63 | "hysteresis": 2, 64 | "min_loss_scale": 1 65 | }, 66 | 67 | "train_iters": 143000, 68 | "lr_decay_iters": 143000, 69 | "distributed_backend": "nccl", 70 | "lr_decay_style": "cosine", 71 | "warmup": 0.01, 72 | "checkpoint_factor": 1000, 73 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 74 | "eval_interval": 143000, 75 | "eval_iters": 10, 76 | 77 | "log_interval": 10, 78 | "steps_per_print": 10, 79 | "wall_clock_breakdown": true, 80 | 81 | "log_grad_norm": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /configs/pythia/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 16, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": true, 17 | "bias_gelu_fusion": true, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 
| "lr": 0.00025, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000025, 31 | 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "fp16": { 44 | "enabled": true, 45 | "type": "bfloat16", 46 | "auto_cast": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "initial_scale_power": 12, 50 | "hysteresis": 2, 51 | "min_loss_scale": 1 52 | }, 53 | 54 | "fp32_allreduce": true, 55 | 56 | "train_micro_batch_size_per_gpu": 4, 57 | "gradient_accumulation_steps": 4, 58 | "data_impl": "mmap", 59 | "num_workers": 1, 60 | 61 | "checkpoint_activations": true, 62 | "checkpoint_num_layers": 1, 63 | "partition_activations": true, 64 | "synchronize_each_layer": true, 65 | 66 | "gradient_clipping": 1.0, 67 | "weight_decay": 0.1, 68 | "hidden_dropout": 0, 69 | "attention_dropout": 0, 70 | 71 | "train_iters": 143000, 72 | "lr_decay_iters": 143000, 73 | "distributed_backend": "nccl", 74 | "lr_decay_style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint_factor": 1000, 77 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 78 | "eval_interval": 143000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | "tokenizer_type": "HFTokenizer" 86 | } 87 | -------------------------------------------------------------------------------- /configs/pythia/2-8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 32]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00016, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.000016, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": 
[0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 40000, 78 | "eval_iters": 10, 79 | 80 | "log_grad_norm": true, 81 | 82 | "log_interval": 10, 83 | "steps_per_print": 10, 84 | "wall_clock_breakdown": true, 85 | 86 | "tokenizer_type": "HFTokenizer" 87 | } 88 | -------------------------------------------------------------------------------- /configs/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 16, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | # init methods 22 | "init_method": "small_init", 23 | "output_layer_init_method": "wang_init", 24 | 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.00025, 29 | "betas": [0.9, 0.95], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "min_lr": 0.000025, 34 | 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | "train_micro_batch_size_per_gpu": 16, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1, 72 | }, 73 | 74 | "train_iters": 143000, 75 | "lr_decay_iters": 143000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 1000, 80 | "eval_interval": 40000, 81 | "eval_iters": 10, 82 | 83 | "log_interval": 10, 84 | "steps_per_print": 10, 85 | "wall_clock_breakdown": true, 86 | } 87 | -------------------------------------------------------------------------------- /tests/config/test_setup.yml: -------------------------------------------------------------------------------- 1 | # 19M parameter model, & local setup with some additional simplifications 2 | { 3 | # Settings to make the test setup as lightweight as possible 4 | "data_path": "data/enwik8/enwik8_text_document", 5 | "vocab_file": "data/gpt2-vocab.json", 6 | "merge_file": "data/gpt2-merges.txt", 7 | "lr_decay_iters": 20, 8 | "train_iters": 20, 9 | "hostfile": "None", 10 | "include": "localhost:1", 11 | "use_wandb": False, 12 | 13 | # Settings copied from 19M parameter config (some modifications above, meaning we can't use configs/19M.yml directly) 14 | "pipe_parallel_size": 1, 15 | "model_parallel_size": 1, 16 | 17 | # model settings 18 | "num_layers": 2, 19 | "hidden_size": 8, 20 | "num_attention_heads": 4, 21 | 
"seq_length": 1024, 22 | "max_position_embeddings": 1024, 23 | "pos_emb": "rotary", 24 | "no_weight_tying": true, 25 | "gpt_j_residual": false, 26 | "output_layer_parallelism": "column", 27 | 28 | "scaled_upper_triang_masked_softmax_fusion": false, 29 | "bias_gelu_fusion": false, 30 | "rope_fusion": false, 31 | "layernorm_fusion": false, 32 | 33 | # Optimizer 34 | "optimizer": { 35 | "type": "sm3", 36 | "params": {}, 37 | }, 38 | 39 | # precision 40 | "precision": "fp16", 41 | 42 | # init methods 43 | "init_method": "small_init", 44 | "output_layer_init_method": "wang_init", 45 | 46 | "train_micro_batch_size_per_gpu": 4, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | "distributed_backend": "nccl", 64 | "lr_decay_style": "cosine", 65 | "warmup": 0.01, 66 | "checkpoint_factor": 1000, 67 | "eval_interval": 100000, 68 | "eval_iters": 10, 69 | 70 | "log_interval": 10, 71 | "steps_per_print": 10, 72 | "wall_clock_breakdown": true, 73 | 74 | # additional deepspeed args not specified above 75 | "deepspeed_extra_args": { 76 | "comms_logger": { 77 | "enabled": true, 78 | "verbose": true, 79 | "prof_all": true, 80 | "debug": false 81 | }, 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /configs/finetuning_configs/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "load": "/path/to/checkpoint", 4 | "finetune": true, 5 | 6 | "pipe-parallel-size": 1, 7 | "model-parallel-size": 2, 8 | 9 | "num-layers": 32, 10 | "hidden-size": 4096, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "rotary_pct": 0.25, 17 | "no-weight-tying": true, 18 | "gpt_j_residual": true, 19 | "output_layer_parallelism": "column", 20 | 21 | "attention-config": [[["flash"], 32]], 22 | 23 | "scaled-upper-triang-masked-softmax-fusion": true, 24 | "bias-gelu-fusion": true, 25 | 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00012, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | 36 | "min_lr": 0.000012, 37 | 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": true, 41 | "allgather_bucket_size": 1260000000, 42 | "overlap_comm": true, 43 | "reduce_scatter": true, 44 | "reduce_bucket_size": 1260000000, 45 | "contiguous_gradients": true, 46 | "cpu_offload": false, 47 | "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params 48 | }, 49 | 50 | "train_micro_batch_size_per_gpu": 8, 51 | "gradient_accumulation_steps": 2, 52 | "data-impl": "mmap", 53 | 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | "gradient_clipping": 1.0, 60 | "weight-decay": 0.1, 61 | "hidden-dropout": 0, 62 | "attention-dropout": 0, 63 | 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | "train-iters": 143000, 75 | "lr-decay-iters": 143000, 76 | 
"distributed-backend": "nccl", 77 | "lr-decay-style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint-factor": 1000, 80 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 81 | "eval-interval": 143000, 82 | "eval-iters": 10, 83 | 84 | "log-interval": 10, 85 | "steps_per_print": 10, 86 | "wall_clock_breakdown": true, 87 | 88 | "tokenizer_type": "HFTokenizer" 89 | } 90 | -------------------------------------------------------------------------------- /configs/autotuning_configs/tune_1-3B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 24, 6 | "hidden-size": 2048, 7 | "num-attention-heads": 16, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | "attention_config": [[["flash"], 24]], 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0002, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00002, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | "train_micro_batch_size_per_gpu": 1, 42 | "autotuning": { 43 | "enabled": true, 44 | "arg_mappings": { 45 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 46 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 47 | } 48 | }, 49 | "data-impl": "mmap", 50 | 51 | "checkpoint-activations": false, 52 | "checkpoint-num-layers": 1, 53 | "partition-activations": true, 54 | "synchronize-each-layer": true, 55 | 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint-factor": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | "launcher": "slurm", 79 | "deepspeed_slurm": true, 80 | "no_ssh_check": true, 81 | 82 | "log-interval": 10, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 1, 85 | "wall_clock_breakdown": true 86 | } 87 | -------------------------------------------------------------------------------- /configs/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true 
if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | "rope_fusion": false, 22 | "layernorm_fusion": false, 23 | 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0006, 30 | "betas": [0.9, 0.999], 31 | "eps": 1.0e-8, 32 | } 33 | }, 34 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 35 | "zero_optimization": { 36 | "stage": 0, 37 | "allgather_partitions": True, 38 | "allgather_bucket_size": 500000000, 39 | "overlap_comm": True, 40 | "reduce_scatter": True, 41 | "reduce_bucket_size": 500000000, 42 | "contiguous_gradients": True, 43 | }, 44 | 45 | # batch / data settings 46 | "train_micro_batch_size_per_gpu": 4, 47 | "data_impl": "mmap", 48 | "split": "949,50,1", 49 | 50 | # activation checkpointing 51 | "checkpoint_activations": true, 52 | "checkpoint_num_layers": 1, 53 | "partition_activations": true, 54 | "synchronize_each_layer": true, 55 | 56 | # regularization 57 | "gradient_clipping": 1.0, 58 | "weight_decay": 0.0, 59 | "hidden_dropout": 0.0, 60 | "attention_dropout": 0.0, 61 | 62 | "precision": "bfloat16", 63 | 64 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 65 | # misc. training settings 66 | "train_iters": 320000, 67 | "lr_decay_iters": 320000, 68 | "distributed_backend": "nccl", 69 | "lr_decay_style": "cosine", 70 | "warmup": 0.01, 71 | "checkpoint_factor": 10000, 72 | "eval_interval": 1000, 73 | "eval_iters": 10, 74 | 75 | # logging 76 | "log_interval": 100, 77 | "steps_per_print": 10, 78 | "keep_last_n_checkpoints": 4, 79 | "wall_clock_breakdown": true, 80 | } 81 | -------------------------------------------------------------------------------- /post-training/recreating_zephyr_dpo.md: -------------------------------------------------------------------------------- 1 | # Initial setup 2 | 3 | ```bash 4 | python tools/ckpts/convert_hf_llama_to_neox.py --tp 2 --model HuggingFaceH4/mistral-7b-sft-beta --model_path checkpoints/neox_converted/zephyr-sft_tp2 5 | ``` 6 | 7 | 8 | # To generate data 9 | First make a new environment... We want to keep the same data between runs so the easiest way is to create a new conda 10 | environment and follow the steps below. 11 | ``` 12 | conda create -n handbook python=3.10 && conda activate handbook 13 | git clone https://github.com/huggingface/alignment-handbook.git 14 | cd ./alignment-handbook/ 15 | python -m pip install . 
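# note: jsonlines is installed on top of the handbook package; presumably it is what the data-generation script used below (post-training/dpo_data.py) needs to write the .jsonl files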
16 | python -m pip install jsonlines 17 | ``` 18 | 19 | ## DPO data 20 | ```bash 21 | # from the gpt-neox repo 22 | conda activate handbook 23 | python post-training/dpo_data.py 24 | conda deactivate 25 | # activate your neox conda environment, or whatever you need to switch to the neox environment 26 | mkdir data 27 | mkdir data/pairwise 28 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 29 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 30 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last 31 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 32 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 33 | python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last 34 | ``` 35 | 36 | ## Running 37 | ```bash 38 | python deepy.py train.py post-training/configs/benchmarking/mistral-dpo.yml 39 | ``` 40 | -------------------------------------------------------------------------------- /configs/mamba/mamba-130M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 24, 7 | "hidden_size": 768, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 24]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | 27 | # optimizer settings 28 | "optimizer": { 29 | "type": "Adam", 30 | "params": { 31 | "lr": 0.0006, 32 | "betas": [0.9, 0.95], 33 | "eps": 1.0e-8, 34 | } 35 | }, 36 | "min_lr": 0.00006, 37 | 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | 49 | # batch / data settings 50 | 
"train_micro_batch_size_per_gpu": 4, 51 | "data_impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint_activations": true, 55 | "checkpoint_num_layers": 1, 56 | "partition_activations": true, 57 | "synchronize_each_layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight_decay": 0.1, 62 | "hidden_dropout": 0.0, 63 | "attention_dropout": 0.0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-370M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 1024, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0003, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00003, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | # batch / data settings 48 | "train_micro_batch_size_per_gpu": 4, 49 | "data_impl": "mmap", 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "hysteresis": 2, 70 | "min_loss_scale": 1 71 | }, 72 | 73 | # misc. 
training settings 74 | "train_iters": 320000, 75 | "lr_decay_iters": 320000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 10000, 80 | "eval_interval": 1000, 81 | "eval_iters": 10, 82 | 83 | # logging 84 | "log_interval": 100, 85 | "steps_per_print": 10, 86 | "keep_last_n_checkpoints": 4, 87 | "wall_clock_breakdown": true, 88 | } 89 | -------------------------------------------------------------------------------- /tools/datasets/multinode_prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # USAGE: 4 | # This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributing the chunks 5 | # over the processes. 6 | # This bash script takes a single text file as its input argument. 7 | # The text file contains a valid filepath on each line, each pointing to a jsonl file. 8 | # Furthermore, environment variables for the rank and the world size need to be set. 9 | # These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well 10 | # using the variables $RANK and $WORLD_SIZE, which will override the cluster-specific variables. 11 | # You can also add all arguments of the preprocess_data.py script to this script and it will simply pass them through. 12 | 13 | # Parse command-line arguments 14 | text_file="$1" 15 | rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}" 16 | world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}" 17 | num_lines=$(wc -l < "$text_file") 18 | chunk_size=$((num_lines / world_size)) 19 | start_line=$((rank * chunk_size + 1)) 20 | end_line=$((start_line + chunk_size - 1)) 21 | 22 | # Make sure the last chunk includes all remaining lines 23 | if [[ $rank == $((world_size - 1)) ]]; then 24 | end_line=$num_lines 25 | fi 26 | 27 | # Select the chunk of the text file that corresponds to the rank 28 | chunk_file="chunk_${rank}.txt" 29 | sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file" 30 | 31 | # Parse additional flags to be passed to the Python script 32 | shift 1 # Shift past the first argument (the input text file) 33 | py_args="" 34 | prefix_arg="" 35 | while [[ $# -gt 0 ]]; do 36 | case "$1" in 37 | --output-prefix=*) prefix_arg="$1"; shift;; 38 | --output-prefix) prefix_arg="$1 $2"; shift 2;; 39 | --*) py_args="$py_args $1 $2"; shift 2;; 40 | *) echo "Unknown argument: $1"; exit 1;; 41 | esac 42 | done 43 | 44 | # Add the rank to the --output-prefix argument if it is set 45 | if [[ -n "$prefix_arg" ]]; then 46 | py_args="$py_args $prefix_arg$rank" 47 | else 48 | # Inject a default --output-prefix argument containing the rank 49 | py_args="$py_args --output-prefix rank${rank}" 50 | fi 51 | 52 | 53 | echo "processing $chunk_file with rank $rank at world size $world_size" 54 | echo "using the following args: $py_args" 55 | # Call the Python script with the list of file paths in the chunk 56 | python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args 57 | 58 | # Clean up 59 | rm "$chunk_file" 60 | -------------------------------------------------------------------------------- /configs/mamba/mamba-1.4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 12, # ignored when
using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0002, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00002, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 1, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-2.8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 64, 7 | "hidden_size": 2560, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 64]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | 
"overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/mamba/mamba-790M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for Mamba 3 | "pipe_parallel_size": 0, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 48, 7 | "hidden_size": 1536, 8 | "num_attention_heads": 12, # ignored when using mamba 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "output_layer_parallelism": "column", 12 | "norm": "rmsnorm", 13 | "rms_norm_epsilon": 1.0e-5, 14 | 15 | "attention_config": [[["mamba"], 48]], 16 | 17 | "mamba_selective_scan_fusion": true, 18 | "mamba_causal_conv_fusion": true, 19 | "mamba_inner_func_fusion": true, # supersedes scan or conv fusion 20 | "activation": "silu", 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "single_residual_scaled_normal", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00025, 31 | "betas": [0.9, 0.999], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.000025, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 4, 50 | "data_impl": "mmap", 51 | 52 | # activation checkpointing 53 | "checkpoint_activations": true, 54 | "checkpoint_num_layers": 1, 55 | "partition_activations": true, 56 | "synchronize_each_layer": true, 57 | 58 | # regularization 59 | "gradient_clipping": 1.0, 60 | "weight_decay": 0.1, 61 | "hidden_dropout": 0, 62 | "attention_dropout": 0, 63 | 64 | # precision settings 65 | "fp16": { 66 | "fp16": true, 67 | "enabled": true, 68 | "loss_scale": 0, 69 | "loss_scale_window": 1000, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1 72 | }, 73 | 74 | # misc. 
training settings 75 | "train_iters": 320000, 76 | "lr_decay_iters": 320000, 77 | "distributed_backend": "nccl", 78 | "lr_decay_style": "cosine", 79 | "warmup": 0.01, 80 | "checkpoint_factor": 10000, 81 | "eval_interval": 1000, 82 | "eval_iters": 10, 83 | 84 | # logging 85 | "log_interval": 100, 86 | "steps_per_print": 10, 87 | "keep_last_n_checkpoints": 4, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /configs/49M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe_parallel_size": 1, 4 | "model_parallel_size": 1, 5 | 6 | # model settings 7 | "num_layers": 10, 8 | "hidden_size": 640, 9 | "num_attention_heads": 10, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 0.25, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": true, 16 | "output_layer_parallelism": "column", 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | "rope_fusion": false, 22 | "layernorm_fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0008, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00008, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 32, 52 | "gradient_accumulation_steps": 1, 53 | "data_impl": "mmap", 54 | "num_workers": 1, 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "initial_scale_power": 12, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1, 77 | }, 78 | 79 | # misc. 
training settings 80 | "train_iters": 143000, 81 | "lr_decay_iters": 143000, 82 | "distributed_backend": "nccl", 83 | "lr_decay_style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint_factor": 1000, 86 | "eval_interval": 100000, 87 | "eval_iters": 10, 88 | 89 | # logging 90 | "log_interval": 10, 91 | "steps_per_print": 10, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/bnb_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "use_bnb_optimizer": true, 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled_upper_triang_masked_softmax_fusion": false, 21 | "bias_gelu_fusion": false, 22 | "rope_fusion": false, 23 | "layernorm_fusion": false, 24 | 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.999], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 0, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | # batch / data settings 47 | "train_micro_batch_size_per_gpu": 4, 48 | "data_impl": "mmap", 49 | "split": "949,50,1", 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.0, 60 | "hidden_dropout": 0.0, 61 | "attention_dropout": 0.0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "enabled": true, 66 | "loss_scale": 0, 67 | "loss_scale_window": 1000, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1 70 | }, 71 | 72 | # misc. 
training settings 73 | "train_iters": 320000, 74 | "lr_decay_iters": 320000, 75 | "distributed_backend": "nccl", 76 | "lr_decay_style": "cosine", 77 | "warmup": 0.01, 78 | "checkpoint_factor": 10000, 79 | "eval_interval": 1000, 80 | "eval_iters": 10, 81 | 82 | # logging 83 | "log_interval": 100, 84 | "steps_per_print": 10, 85 | "keep_last_n_checkpoints": 4, 86 | "wall_clock_breakdown": true, 87 | } 88 | -------------------------------------------------------------------------------- /configs/19M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 6, 7 | "hidden_size": 512, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | "rope_fusion": false, 19 | "layernorm_fusion": false, 20 | 21 | # init methods 22 | "init_method": "small_init", 23 | "output_layer_init_method": "wang_init", 24 | 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.001, 29 | "betas": [0.9, 0.95], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | "min_lr": 0.0001, 34 | 35 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": True, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": True, 41 | "reduce_scatter": True, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": True, 44 | }, 45 | 46 | "train_micro_batch_size_per_gpu": 4, #32, 47 | "gradient_accumulation_steps": 1, 48 | "data_impl": "mmap", 49 | "num_workers": 1, 50 | 51 | # activation checkpointing 52 | "checkpoint_activations": true, 53 | "checkpoint_num_layers": 1, 54 | "partition_activations": true, 55 | "synchronize_each_layer": true, 56 | 57 | # regularization 58 | "gradient_clipping": 1.0, 59 | "weight_decay": 0.1, 60 | "hidden_dropout": 0, 61 | "attention_dropout": 0, 62 | 63 | # precision settings 64 | "fp16": { 65 | "fp16": true, 66 | "enabled": true, 67 | "loss_scale": 0, 68 | "loss_scale_window": 1000, 69 | "initial_scale_power": 12, 70 | "hysteresis": 2, 71 | "min_loss_scale": 1, 72 | }, 73 | 74 | "train_iters": 143000, 75 | "lr_decay_iters": 143000, 76 | "distributed_backend": "nccl", 77 | "lr_decay_style": "cosine", 78 | "warmup": 0.01, 79 | "checkpoint_factor": 1000, 80 | "eval_interval": 100000, 81 | "eval_iters": 10, 82 | 83 | "log_interval": 10, 84 | "steps_per_print": 10, 85 | "wall_clock_breakdown": true, 86 | 87 | # additional deepspeed args not specified above 88 | "deepspeed_extra_args": { 89 | "comms_logger": { 90 | "enabled": true, 91 | "verbose": true, 92 | "prof_all": true, 93 | "debug": false 94 | }, 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from tools.datasets.corpora import prepare_dataset, DATA_DOWNLOADERS 16 | import argparse 17 | 18 | TOKENIZER_CHOICES = [ 19 | "HFGPT2Tokenizer", 20 | "HFTokenizer", 21 | "GPT2BPETokenizer", 22 | "CharLevelTokenizer", 23 | "TiktokenTokenizer", 24 | "SPMTokenizer", 25 | ] 26 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"] 27 | 28 | 29 | def get_args(): 30 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets") 31 | parser.add_argument( 32 | "dataset", 33 | nargs="?", 34 | default="enwik8", 35 | help="name of dataset to download.", 36 | choices=DATASET_CHOICES, 37 | ) 38 | parser.add_argument( 39 | "-t", 40 | "--tokenizer", 41 | default="GPT2BPETokenizer", 42 | choices=TOKENIZER_CHOICES, 43 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}', 44 | ) 45 | parser.add_argument( 46 | "-d", 47 | "--data-dir", 48 | default=None, 49 | help=f"Directory to which to download datasets / tokenizer " 50 | f"files - defaults to ./data", 51 | ) 52 | parser.add_argument( 53 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)" 54 | ) 55 | parser.add_argument( 56 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)" 57 | ) 58 | parser.add_argument( 59 | "-f", 60 | "--force-redownload", 61 | dest="force_redownload", 62 | default=False, 63 | action="store_true", 64 | ) 65 | return parser.parse_args() 66 | 67 | 68 | if __name__ == "__main__": 69 | args = get_args() 70 | prepare_dataset( 71 | dataset_name=args.dataset, 72 | tokenizer_type=args.tokenizer, 73 | data_dir=args.data_dir, 74 | vocab_file=args.vocab_file, 75 | merge_file=args.merge_file, 76 | force_redownload=args.force_redownload, 77 | ) 78 | -------------------------------------------------------------------------------- /tools/datasets/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | 6 | sys.path.append( 7 | os.path.abspath( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) 9 | ) 10 | ) 11 | 12 | from megatron.data import indexed_dataset 13 | 14 | 15 | def main(args): 16 | 17 | prefixes = set() 18 | for basename in os.listdir(args.input): 19 | prefix, ext = os.path.splitext(basename) 20 | 21 | if prefix in prefixes: 22 | continue 23 | 24 | if not os.path.isfile(os.path.join(args.input, basename)): 25 | continue 26 | 27 | ext_pair = ".bin" if ext == ".idx" else ".idx" 28 | assert os.path.isfile( 29 | os.path.join(args.input, prefix) + ext_pair 30 | ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" 31 | 32 | prefixes.add(prefix) 33 | 34 | builder = None 35 | for prefix in sorted(prefixes): 36 | if builder is None: 37 | dataset = indexed_dataset.make_dataset( 38 | os.path.join(args.input, prefix), "infer" 39 | ) 40 | 41 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset): 42 | builder = indexed_dataset.MMapIndexedDatasetBuilder( 43 | args.output_prefix + ".bin", dtype=dataset._index.dtype 
44 | ) 45 | else: 46 | builder = indexed_dataset.IndexedDatasetBuilder( 47 | args.output_prefix + ".bin" 48 | ) 49 | 50 | del dataset 51 | 52 | builder.merge_file_(os.path.join(args.input, prefix)) 53 | 54 | builder.finalize(args.output_prefix + ".idx") 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | 60 | group = parser.add_argument_group(title="input data") 61 | group.add_argument( 62 | "--input", 63 | type=str, 64 | required=True, 65 | help="Path to directory containing all document files to merge", 66 | ) 67 | 68 | group = parser.add_argument_group(title="output data") 69 | group.add_argument( 70 | "--output-prefix", 71 | type=str, 72 | required=True, 73 | help="Path to binary output file without suffix", 74 | ) 75 | 76 | args = parser.parse_args() 77 | 78 | assert os.path.isdir( 79 | args.input 80 | ), f"ERROR: {args.input} is not a directory or does not exist" 81 | 82 | assert os.path.isdir( 83 | os.path.dirname(args.output_prefix) 84 | ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" 85 | 86 | main(args) 87 | -------------------------------------------------------------------------------- /configs/pythia/31M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 0, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 6, 8 | "hidden-size": 256, 9 | "num-attention-heads": 8, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 6]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # init methods 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.0001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # batch size (trained on 32 gpus) 49 | "train_micro_batch_size_per_gpu": 32, 50 | "data-impl": "mmap", 51 | "num_workers": 2, 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": false, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": false, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "initial_scale_power": 12, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train-iters": 143000, 77 | "lr-decay-iters": 143000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 1000, 82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 83 | "eval-interval": 100000, 84 | "eval-iters": 10, 85 | "log-interval": 10, 86 | "steps_per_print": 10, 87 | "wall_clock_breakdown": true, 88 | 89 | "train-data-paths": 
["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 90 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 91 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 92 | 93 | "tokenizer-type": "HFTokenizer", 94 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" 95 | 96 | } 97 | -------------------------------------------------------------------------------- /configs/pythia/14M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 0, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 6, 8 | "hidden-size": 128, 9 | "num-attention-heads": 4, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 6]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # init methods 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.0001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 50000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 50000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # batch size (trained on 32 gpus) 49 | "train_micro_batch_size_per_gpu": 32, 50 | "data-impl": "mmap", 51 | "num_workers": 4, 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": false, #true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": false, #true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "initial_scale_power": 12, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train-iters": 143000, 77 | "lr-decay-iters": 143000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 1000, 82 | "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], 83 | "eval-interval": 100000, 84 | "eval-iters": 10, 85 | 86 | "log-interval": 10, 87 | "steps_per_print": 10, 88 | "wall_clock_breakdown": true, 89 | 90 | "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 91 | "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 92 | "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], 93 | 94 | "tokenizer-type": "HFTokenizer", 95 | "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" 96 | 97 | } 98 | -------------------------------------------------------------------------------- /configs/175B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 
| # model settings 9 | "num_layers": 96, 10 | "hidden_size": 12288, 11 | "num_attention_heads": 96, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00006, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000006, 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data_impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint_activations": true, 57 | "checkpoint_num_layers": 1, 58 | "partition_activations": true, 59 | "synchronize_each_layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train_iters": 320000, 79 | "lr_decay_iters": 320000, 80 | "distributed_backend": "nccl", 81 | "lr_decay_style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint_factor": 10000, 84 | "eval_interval": 1000, 85 | "eval_iters": 10, 86 | 87 | # logging 88 | "log_interval": 100, 89 | "steps_per_print": 10, 90 | "keep_last_n_checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/350M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 1024, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0003, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00003, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data_impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint_activations": true, 57 | "checkpoint_num_layers": 1, 58 | "partition_activations": true, 59 | "synchronize_each_layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train_iters": 320000, 79 | "lr_decay_iters": 320000, 80 | "distributed_backend": "nccl", 81 | "lr_decay_style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint_factor": 10000, 84 | "eval_interval": 1000, 85 | "eval_iters": 10, 86 | 87 | # logging 88 | "log_interval": 100, 89 | "steps_per_print": 10, 90 | "keep_last_n_checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/1-3B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 2048, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0002, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00002, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/2-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 32, 10 | "hidden_size": 2560, 11 | "num_attention_heads": 32, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00016, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000016, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/6-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 32, 10 | "hidden_size": 4096, 11 | "num_attention_heads": 32, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00012, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | "min_lr": 0.000012, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /configs/13B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 40, 10 | "hidden_size": 5120, 11 | "num_attention_heads": 40, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0001, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | "min_lr": 0.00001, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 4, 55 | "data_impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight_decay": 0.1, 66 | "hidden_dropout": 0, 67 | "attention_dropout": 0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "fp16": true, 72 | "enabled": true, 73 | "loss_scale": 0, 74 | "loss_scale_window": 1000, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1 77 | }, 78 | 79 | # misc. 
training settings 80 | "train_iters": 320000, 81 | "lr_decay_iters": 320000, 82 | "distributed_backend": "nccl", 83 | "lr_decay_style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint_factor": 10000, 86 | "eval_interval": 1000, 87 | "eval_iters": 10, 88 | 89 | # logging 90 | "log_interval": 100, 91 | "steps_per_print": 10, 92 | "keep_last_n_checkpoints": 4, 93 | "wall_clock_breakdown": true, 94 | } 95 | -------------------------------------------------------------------------------- /configs/760M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 24, 10 | "hidden_size": 1536, 11 | "num_attention_heads": 16, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | "rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.00025, 35 | "betas": [0.9, 0.999], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.000025, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 4, 54 | "data_impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "fp16": true, 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | } 94 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_tensor_model_parallel_group 35 | from .initialize import get_tensor_model_parallel_rank 36 | from .initialize import get_tensor_model_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import ColumnParallelLinear 42 | from .layers import RowParallelLinear 43 | from .layers import VocabParallelEmbedding 44 | from .layers import ParallelRelativePositionBias 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | from .mappings import reduce_scatter_to_sequence_parallel_region 51 | from .mappings import gather_from_sequence_parallel_region 52 | from .mappings import scatter_to_sequence_parallel_region 53 | 54 | from .random import checkpoint 55 | from .random import get_cuda_rng_tracker 56 | from .random import model_parallel_cuda_manual_seed 57 | 58 | from .utils import divide 59 | from .utils import split_tensor_along_last_dim 60 | -------------------------------------------------------------------------------- /configs/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled_upper_triang_masked_softmax_fusion": false, 22 | "bias_gelu_fusion": false, 23 | 
"rope_fusion": false, 24 | "layernorm_fusion": false, 25 | 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 4, 55 | "data_impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight_decay": 0.1, 66 | "hidden_dropout": 0.0, 67 | "attention_dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train_iters": 320000, 80 | "lr_decay_iters": 320000, 81 | "distributed_backend": "nccl", 82 | "lr_decay_style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint_factor": 10000, 85 | "eval_interval": 1000, 86 | "eval_iters": 10, 87 | 88 | # logging 89 | "log_interval": 100, 90 | "steps_per_print": 10, 91 | "keep_last_n_checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path" 96 | } 97 | -------------------------------------------------------------------------------- /configs/125M/512/125M_cope.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "cope", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 32, 53 | 
"data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0.0, 65 | "attention-dropout": 0.0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. training settings 77 | "train-iters": 50000, 78 | "lr-decay-iters": 50000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 5000, 84 | "eval-iters": 20, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | 92 | # networking 93 | "hostfile": "/mock_path", 94 | "save": "checkpoints/125M_cope", 95 | "load": "checkpoints/125M_cope", 96 | "tensorboard-dir": "tensorboard/125M_cope", 97 | "log-dir": "logs/125M_cope", 98 | } 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb logs 132 | wandb/ 133 | 134 | # data files 135 | data/**/*.idx 136 | data/**/*.bin 137 | data/**/*.json* 138 | data/**/*.txt 139 | data/**/*.gz 140 | data/**/*.zip 141 | data/**/*.np* 142 | data/**/*.npy 143 | checkpoints/ 144 | .vscode/ 145 | *.pt 146 | *.ckpt 147 | 148 | #test logs 149 | test_checkpoint/ 150 | test_logs/ 151 | logs/ 152 | tensorboard/ 153 | src/ 154 | 155 | # test data files 156 | tests/data/*.bin 157 | tests/data/*.idx 158 | -------------------------------------------------------------------------------- /configs/rwkv/170M.yml: -------------------------------------------------------------------------------- 1 | { 2 | # Parallelism is not yet supported for rwkv 3 | "pipe_parallel_size": 1, 4 | "model_parallel_size": 1, 5 | 6 | "num_layers": 12, 7 | "hidden_size": 768, 8 | "num_attention_heads": 12, # head_size = dim_att / num_attention_heads. 9 | # head_size is 64 for all rwkv models 10 | "seq_length": 512, 11 | "max_position_embeddings": 2048, 12 | "output_layer_parallelism": "column", 13 | "norm": "rmsnorm", 14 | "rms_norm_epsilon": 1.0e-5, 15 | "train_micro_batch_size_per_gpu": 32, 16 | 17 | "attention_config": [[["rwkv"], 12]], 18 | 19 | "activation": "silu", 20 | 21 | # model settings 22 | 23 | #"pos_emb": "rotary", 24 | "rotary_pct": 0.25, 25 | "no_weight_tying": true, 26 | "gpt_j_residual": true, 27 | 28 | # these should provide some speedup but takes a while to build, set to true if desired 29 | "scaled_upper_triang_masked_softmax_fusion": false, 30 | "bias_gelu_fusion": false, 31 | "rope_fusion": false, 32 | "layernorm_fusion": false, 33 | 34 | 35 | # init methods 36 | "init_method": "small_init", 37 | "output_layer_init_method": "wang_init", 38 | 39 | # optimizer settings 40 | "optimizer": { 41 | "type": "Adam", 42 | "params": { 43 | "lr": 0.0008, 44 | "betas": [0.9, 0.95], 45 | "eps": 1.0e-8, 46 | } 47 | }, 48 | "min_lr": 0.00008, 49 | 50 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 51 | "zero_optimization": { 52 | "stage": 1, 53 | "allgather_partitions": True, 54 | "allgather_bucket_size": 500000000, 55 | "overlap_comm": True, 56 | "reduce_scatter": True, 57 | "reduce_bucket_size": 500000000, 58 | "contiguous_gradients": True, 59 | }, 60 | 61 | # batch / data settings 62 | "data_impl": "mmap", 63 | "num_workers": 1, 64 | 65 | # activation checkpointing 66 | "checkpoint_activations": true, 67 | "checkpoint_num_layers": 1, 68 | "partition_activations": true, 69 | "synchronize_each_layer": true, 70 | 71 | # regularization 72 | "gradient_clipping": 1.0, 73 | "weight_decay": 0.1, 74 | "hidden_dropout": 0, 75 | "attention_dropout": 0, 76 | 77 | # precision settings 78 | "bf16": { 79 | "bf16": true, 80 | "enabled": true, 81 | "loss_scale": 0, 82 | "loss_scale_window": 1000, 83 | "initial_scale_power": 12, 84 | "hysteresis": 2, 85 | "min_loss_scale": 1, 86 | }, 87 | 88 | # misc. 
training settings 89 | "train_iters": 500, 90 | "lr_decay_iters": 500, 91 | "distributed_backend": "nccl", 92 | "lr_decay_style": "constant", 93 | "warmup": 0.01, 94 | "checkpoint_factor": 100, 95 | "eval_interval": 100000, 96 | "eval_iters": 10, 97 | 98 | # logging 99 | "log_interval": 10, 100 | "steps_per_print": 10, 101 | "wall_clock_breakdown": true, 102 | } 103 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from megatron import print_rank_0 26 | from megatron import mpu 27 | 28 | 29 | class BlendableDataset(torch.utils.data.Dataset): 30 | def __init__(self, datasets, weights): 31 | self.datasets = datasets 32 | num_datasets = len(datasets) 33 | assert num_datasets == len(weights) 34 | 35 | self.size = 0 36 | for dataset in self.datasets: 37 | self.size += len(dataset) 38 | 39 | # Normalize weights. 40 | weights = np.array(weights, dtype=np.float64) 41 | sum_weights = np.sum(weights) 42 | assert sum_weights > 0.0 43 | weights /= sum_weights 44 | 45 | # Build indices. 
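# helpers.build_blending_indices (imported just below) populates two lookup
# arrays: dataset_index records, for each global sample, which dataset it is
# drawn from (stored as uint8, hence the num_datasets < 255 assert above), and
# dataset_sample_index records that sample's position within the chosen
# dataset; __getitem__ later uses this pair to dispatch to the underlying
# dataset. The normalized weights passed in determine how often each dataset
# is selected.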
46 | start_time = time.time() 47 | assert num_datasets < 255 48 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 49 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 50 | 51 | from megatron.data import helpers 52 | 53 | helpers.build_blending_indices( 54 | self.dataset_index, 55 | self.dataset_sample_index, 56 | weights, 57 | num_datasets, 58 | self.size, 59 | torch.distributed.get_rank() == 0, 60 | ) 61 | 62 | print( 63 | "> RANK {} elapsed time for building blendable dataset indices: " 64 | "{:.2f} (sec)".format( 65 | torch.distributed.get_rank(), time.time() - start_time 66 | ) 67 | ) 68 | 69 | def __len__(self): 70 | return self.size 71 | 72 | def __getitem__(self, idx): 73 | try: 74 | dataset_idx = self.dataset_index[idx] 75 | sample_idx = self.dataset_sample_index[idx] 76 | return self.datasets[dataset_idx][sample_idx] 77 | except IndexError: 78 | new_idx = idx % len(self) 79 | print( 80 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 81 | ) 82 | return self[new_idx] 83 | -------------------------------------------------------------------------------- /megatron/model/rwkv/v6/cuda/wkv6_op.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | typedef at::BFloat16 bf16; 4 | 5 | void cuda_forward(int B, 6 | int T, 7 | int C, 8 | int H, 9 | bf16* r, 10 | bf16* k, 11 | bf16* v, 12 | float* w, 13 | bf16* u, 14 | bf16* y); 15 | void cuda_backward(int B, 16 | int T, 17 | int C, 18 | int H, 19 | bf16* r, 20 | bf16* k, 21 | bf16* v, 22 | float* w, 23 | bf16* u, 24 | bf16* gy, 25 | bf16* gr, 26 | bf16* gk, 27 | bf16* gv, 28 | bf16* gw, 29 | bf16* gu); 30 | 31 | void forward(int64_t B, 32 | int64_t T, 33 | int64_t C, 34 | int64_t H, 35 | torch::Tensor& r, 36 | torch::Tensor& k, 37 | torch::Tensor& v, 38 | torch::Tensor& w, 39 | torch::Tensor& u, 40 | torch::Tensor& y) 41 | { 42 | cuda_forward(B, 43 | T, 44 | C, 45 | H, 46 | r.data_ptr(), 47 | k.data_ptr(), 48 | v.data_ptr(), 49 | w.data_ptr(), 50 | u.data_ptr(), 51 | y.data_ptr()); 52 | } 53 | void backward(int64_t B, 54 | int64_t T, 55 | int64_t C, 56 | int64_t H, 57 | torch::Tensor& r, 58 | torch::Tensor& k, 59 | torch::Tensor& v, 60 | torch::Tensor& w, 61 | torch::Tensor& u, 62 | torch::Tensor& gy, 63 | torch::Tensor& gr, 64 | torch::Tensor& gk, 65 | torch::Tensor& gv, 66 | torch::Tensor& gw, 67 | torch::Tensor& gu) 68 | { 69 | cuda_backward(B, 70 | T, 71 | C, 72 | H, 73 | r.data_ptr(), 74 | k.data_ptr(), 75 | v.data_ptr(), 76 | w.data_ptr(), 77 | u.data_ptr(), 78 | gy.data_ptr(), 79 | gr.data_ptr(), 80 | gk.data_ptr(), 81 | gv.data_ptr(), 82 | gw.data_ptr(), 83 | gu.data_ptr()); 84 | } 85 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 86 | { 87 | m.def("forward", &forward, "wkv6 forward"); 88 | m.def("backward", &backward, "wkv6 backward"); 89 | } 90 | 91 | TORCH_LIBRARY(wkv6, m) 92 | { 93 | m.def("forward", forward); 94 | m.def("backward", backward); 95 | } 96 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 3 | # across the node boundaries ) 4 | "pipe-parallel-size": 1, 5 | "model-parallel-size": 1, 6 | 7 | # model settings 8 | "num-layers": 12, 9 | "hidden-size": 768, 10 | "num-attention-heads": 12, 11 | 
"seq-length": 512, 12 | "max-position-embeddings": 2048, 13 | "norm": "layernorm", 14 | "pos-emb": "fire", 15 | "no-weight-tying": true, 16 | "gpt_j_residual": false, 17 | "output_layer_parallelism": "column", 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled-upper-triang-masked-softmax-fusion": false, 21 | "bias-gelu-fusion": false, 22 | 23 | "mlp_width": 32, 24 | "noise_seq_length": 128, 25 | # init methods 26 | "init_method": "small_init", 27 | "output_layer_init_method": "wang_init", 28 | 29 | 30 | # optimizer settings 31 | "optimizer": { 32 | "type": "Adam", 33 | "params": { 34 | "lr": 0.0006, 35 | "betas": [0.9, 0.95], 36 | "eps": 1.0e-8, 37 | } 38 | }, 39 | "min_lr": 0.00006, 40 | 41 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 42 | "zero_optimization": { 43 | "stage": 1, 44 | "allgather_partitions": True, 45 | "allgather_bucket_size": 500000000, 46 | "overlap_comm": True, 47 | "reduce_scatter": True, 48 | "reduce_bucket_size": 500000000, 49 | "contiguous_gradients": True, 50 | }, 51 | 52 | # batch / data settings 53 | "train_micro_batch_size_per_gpu": 32, 54 | "data-impl": "mmap", 55 | 56 | # activation checkpointing 57 | "checkpoint-activations": true, 58 | "checkpoint-num-layers": 1, 59 | "partition-activations": true, 60 | "synchronize-each-layer": true, 61 | 62 | # regularization 63 | "gradient_clipping": 1.0, 64 | "weight-decay": 0.1, 65 | "hidden-dropout": 0.0, 66 | "attention-dropout": 0.0, 67 | 68 | # precision settings 69 | "fp16": { 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. training settings 78 | "train-iters": 50000, 79 | "lr-decay-iters": 50000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 10000, 84 | "eval-interval": 5000, 85 | "eval-iters": 20, 86 | 87 | # logging 88 | "log-interval": 100, 89 | "steps_per_print": 10, 90 | "keep-last-n-checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | 93 | # networking 94 | "hostfile": "/mock_path", 95 | "save": "checkpoints", 96 | "load": "checkpoints", 97 | "tensorboard-dir": "tensorboard", 98 | "log-dir": "logs", 99 | } 100 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor); 26 | 27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads, 28 | torch::Tensor const& softmax_results, 29 | float scale_factor); 30 | 31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) 32 | { 33 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 34 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 35 | (input.scalar_type() == at::ScalarType::BFloat16), 36 | "Only fp16 and bf16 are supported"); 37 | 38 | return fwd_cuda(input, scale_factor); 39 | } 40 | 41 | torch::Tensor bwd(torch::Tensor const& output_grads, 42 | torch::Tensor const& softmax_results, 43 | float scale_factor) 44 | { 45 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 46 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 47 | 48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 49 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 50 | "Only fp16 and bf16 are supported"); 51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 63 | { 64 | m.def("forward", 65 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 66 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 67 | m.def("backward", 68 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 69 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 70 | } 71 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | plausibility check for the usage of neox_args in the megatron codebase 17 | """ 18 | import pytest 19 | import re 20 | from ..common import get_root_directory 21 | 22 | 23 | @pytest.mark.cpu 24 | def test_neoxargs_usage(): 25 | """ " 26 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs 27 | """ 28 | from megatron.neox_arguments import NeoXArgs 29 | 30 | declared_all = True 31 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys()) 32 | 33 | # we exclude a number of properties (implemented with the @property decorator) or functions that we know exists 34 | exclude = set( 35 | [ 36 | "params_dtype", 37 | "deepspeed_config", 38 | "get", 39 | "pop", 40 | "get_deepspeed_main_args", 41 | 'optimizer["params"]', 42 | "attention_config[layer_number]", 43 | "adlr_autoresume_object", 44 | "update_value", 45 | "all_config", 46 | "tensorboard_writer", 47 | "tokenizer", 48 | "train_batch_size]", 49 | "items", 50 | "configure_distributed_args", 51 | "build_tokenizer", 52 | "attention_config[i]", 53 | "print", 54 | "update", 55 | ] 56 | ) 57 | 58 | # test file by file 59 | for filename in (get_root_directory() / "megatron").glob("**/*.py"): 60 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]: 61 | continue 62 | 63 | # load file 64 | with open(filename, "r") as f: 65 | file_contents = f.read() 66 | 67 | # find args matches 68 | matches = list( 69 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) 70 | ) 71 | if len(matches) == 0: 72 | continue 73 | 74 | # compare 75 | for match in matches: 76 | if match not in neox_args_attributes and match not in exclude: 77 | print( 78 | f"(arguments used not found in neox args): {filename.name}: {match}", 79 | flush=True, 80 | ) 81 | declared_all = False 82 | 83 | assert declared_all, "all arguments used in code defined in NeoXArgs" 84 | -------------------------------------------------------------------------------- /configs/125M/512/125M_alibi.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "alibi", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | 
"reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_alibi_c.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "alibi_c", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 
| "min_loss_scale": 1 76 | }, 77 | 78 | # misc. training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_kerple.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "kerple", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 5000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire_c.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "fire_c", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "noise_seq_length": 128, 26 | # init methods 27 | "init_method": "small_init", 28 | "output_layer_init_method": "wang_init", 29 | 30 | 31 | # optimizer settings 32 | "optimizer": { 33 | "type": "Adam", 34 | "params": { 35 | "lr": 0.0006, 36 | "betas": [0.9, 0.95], 37 | "eps": 1.0e-8, 38 | } 39 | }, 40 | "min_lr": 0.00006, 41 | 42 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 43 | "zero_optimization": { 44 | "stage": 1, 45 | "allgather_partitions": True, 46 | "allgather_bucket_size": 500000000, 47 | "overlap_comm": True, 48 | "reduce_scatter": True, 49 | "reduce_bucket_size": 500000000, 50 | "contiguous_gradients": True, 51 | }, 52 | 53 | # batch / data settings 54 | "train_micro_batch_size_per_gpu": 32, 55 | "data-impl": "mmap", 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints", 97 | "load": "checkpoints", 98 | "tensorboard-dir": "tensorboard", 99 | "log-dir": "logs", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M-moe.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # See README for MoE config docs! 4 | "moe_type": "deepspeed", 5 | "moe_token_dropping": true, 6 | # Have 4 experts per layer (every 2 layers by default) 7 | "moe_num_experts": 4, 8 | # parallelism settings 9 | "enable_expert_tensor_parallelism": true, 10 | "pipe_parallel_size": 1, # not yet supported for MoE 11 | "model_parallel_size": 1, 12 | "moe_expert_parallel_size": 1, 13 | 14 | # model settings 15 | "num_layers": 12, 16 | "hidden_size": 768, 17 | "num_attention_heads": 12, 18 | "seq_length": 2048, 19 | "max_position_embeddings": 2048, 20 | "norm": "layernorm", 21 | "pos_emb": "rotary", 22 | "no_weight_tying": true, 23 | "gpt_j_residual": false, 24 | "output_layer_parallelism": "column", 25 | 26 | # these should provide some speedup but takes a while to build, set to true if desired 27 | "scaled_upper_triang_masked_softmax_fusion": false, 28 | "bias_gelu_fusion": false, 29 | "rope_fusion": false, 30 | 31 | # init methods 32 | "init_method": "small_init", 33 | "output_layer_init_method": "wang_init", 34 | 35 | 36 | # optimizer settings 37 | "optimizer": { 38 | "type": "Adam", 39 | "params": { 40 | "lr": 0.0006, 41 | "betas": [0.9, 0.95], 42 | "eps": 1.0e-8, 43 | } 44 | }, 45 | "min_lr": 0.00006, 46 | 47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 48 | "zero_optimization": { 49 | "stage": 1, 50 | "allgather_partitions": True, 51 | "allgather_bucket_size": 500000000, 52 | "overlap_comm": True, 53 | "reduce_scatter": True, 54 | "reduce_bucket_size": 500000000, 55 | "contiguous_gradients": True, 56 | }, 57 | 58 | # batch / data settings 59 | "train_micro_batch_size_per_gpu": 4, 60 | "data_impl": "mmap", 61 | 62 | # activation checkpointing 63 | "checkpoint_activations": true, 64 | "checkpoint_num_layers": 1, 65 | "partition_activations": true, 66 | "synchronize_each_layer": true, 67 | 68 | # regularization 69 | "gradient_clipping": 1.0, 70 | "weight_decay": 0.1, 71 | "hidden_dropout": 0.0, 72 | "attention_dropout": 0.0, 73 | 74 | # precision settings 75 | "fp16": { 76 | "enabled": true, 77 | "loss_scale": 0, 78 | "loss_scale_window": 1000, 79 | "hysteresis": 2, 80 | "min_loss_scale": 1 81 | }, 82 | 83 | # misc. 
training settings 84 | "train_iters": 320000, 85 | "lr_decay_iters": 320000, 86 | "distributed_backend": "nccl", 87 | "lr_decay_style": "cosine", 88 | "warmup": 0.01, 89 | "checkpoint_factor": 10000, 90 | "eval_interval": 1000, 91 | "eval_iters": 10, 92 | 93 | # logging 94 | "log_interval": 10, 95 | "steps_per_print": 10, 96 | "keep_last_n_checkpoints": 4, 97 | "wall_clock_breakdown": true, 98 | 99 | # networking 100 | "hostfile": "/mock_path" 101 | } 102 | -------------------------------------------------------------------------------- /configs/125M-dmoe.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # See README for MoE config docs! 4 | "moe_type": "megablocks", 5 | "moe_token_dropping": false, 6 | # Have 4 experts per layer (every 2 layers by default) 7 | "moe_num_experts": 4, 8 | # parallelism settings 9 | "enable_expert_tensor_parallelism": true, 10 | "pipe_parallel_size": 1, # not yet supported for MoE 11 | "model_parallel_size": 1, 12 | "moe_expert_parallel_size": 1, 13 | 14 | # model settings 15 | "num_layers": 12, 16 | "hidden_size": 768, 17 | "num_attention_heads": 12, 18 | "seq_length": 2048, 19 | "max_position_embeddings": 2048, 20 | "norm": "layernorm", 21 | "pos_emb": "rotary", 22 | "no_weight_tying": true, 23 | "gpt_j_residual": false, 24 | "output_layer_parallelism": "column", 25 | 26 | # these should provide some speedup but takes a while to build, set to true if desired 27 | "scaled_upper_triang_masked_softmax_fusion": false, 28 | "bias_gelu_fusion": false, 29 | "rope_fusion": false, 30 | 31 | # init methods 32 | "init_method": "small_init", 33 | "output_layer_init_method": "wang_init", 34 | 35 | 36 | # optimizer settings 37 | "optimizer": { 38 | "type": "Adam", 39 | "params": { 40 | "lr": 0.0006, 41 | "betas": [0.9, 0.95], 42 | "eps": 1.0e-8, 43 | } 44 | }, 45 | "min_lr": 0.00006, 46 | 47 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 48 | "zero_optimization": { 49 | "stage": 0, 50 | "allgather_partitions": True, 51 | "allgather_bucket_size": 500000000, 52 | "overlap_comm": True, 53 | "reduce_scatter": True, 54 | "reduce_bucket_size": 500000000, 55 | "contiguous_gradients": True, 56 | }, 57 | 58 | # batch / data settings 59 | "train_micro_batch_size_per_gpu": 4, 60 | "data_impl": "mmap", 61 | 62 | # activation checkpointing 63 | "checkpoint_activations": true, 64 | "checkpoint_num_layers": 1, 65 | "partition_activations": true, 66 | "synchronize_each_layer": true, 67 | 68 | # regularization 69 | "gradient_clipping": 1.0, 70 | "weight_decay": 0.1, 71 | "hidden_dropout": 0.0, 72 | "attention_dropout": 0.0, 73 | 74 | # precision settings 75 | "fp16": { 76 | "enabled": true, 77 | "loss_scale": 0, 78 | "loss_scale_window": 1000, 79 | "hysteresis": 2, 80 | "min_loss_scale": 1 81 | }, 82 | 83 | # misc. 
training settings 84 | "train_iters": 320000, 85 | "lr_decay_iters": 320000, 86 | "distributed_backend": "nccl", 87 | "lr_decay_style": "cosine", 88 | "warmup": 0.01, 89 | "checkpoint_factor": 10000, 90 | "eval_interval": 1000, 91 | "eval_iters": 10, 92 | 93 | # logging 94 | "log_interval": 10, 95 | "steps_per_print": 10, 96 | "keep_last_n_checkpoints": 4, 97 | "wall_clock_breakdown": true, 98 | 99 | # networking 100 | "hostfile": "/mock_path" 101 | } 102 | -------------------------------------------------------------------------------- /configs/125M/512/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 32, 53 | "data-impl": "mmap", 54 | 55 | "mlp_width": 32, 56 | 57 | # activation checkpointing 58 | "checkpoint-activations": true, 59 | "checkpoint-num-layers": 1, 60 | "partition-activations": true, 61 | "synchronize-each-layer": true, 62 | 63 | # regularization 64 | "gradient_clipping": 1.0, 65 | "weight-decay": 0.1, 66 | "hidden-dropout": 0.0, 67 | "attention-dropout": 0.0, 68 | 69 | # precision settings 70 | "fp16": { 71 | "enabled": true, 72 | "loss_scale": 0, 73 | "loss_scale_window": 1000, 74 | "hysteresis": 2, 75 | "min_loss_scale": 1 76 | }, 77 | 78 | # misc. 
training settings 79 | "train-iters": 50000, 80 | "lr-decay-iters": 50000, 81 | "distributed-backend": "nccl", 82 | "lr-decay-style": "cosine", 83 | "warmup": 0.01, 84 | "checkpoint-factor": 10000, 85 | "eval-interval": 5000, 86 | "eval-iters": 20, 87 | 88 | # logging 89 | "log-interval": 100, 90 | "steps_per_print": 10, 91 | "keep-last-n-checkpoints": 4, 92 | "wall_clock_breakdown": true, 93 | 94 | # networking 95 | "hostfile": "/mock_path", 96 | "save": "checkpoints/125M/none_c_ffn", 97 | "load": "checkpoints/125M/none_c_ffn", 98 | "tensorboard-dir": "tensorboard/125M/none_c_ffn", 99 | "log-dir": "logs/125M/none_c_ffn", 100 | } 101 | -------------------------------------------------------------------------------- /configs/125M/512/125M_fire_capev2.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 512, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "capev2", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | "mlp_width": 32, 25 | "capev2_kernel": 3, 26 | "noise_seq_length": 512, 27 | # init methods 28 | "init_method": "small_init", 29 | "output_layer_init_method": "wang_init", 30 | 31 | 32 | # optimizer settings 33 | "optimizer": { 34 | "type": "Adam", 35 | "params": { 36 | "lr": 0.0006, 37 | "betas": [0.9, 0.95], 38 | "eps": 1.0e-8, 39 | } 40 | }, 41 | "min_lr": 0.00006, 42 | 43 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 44 | "zero_optimization": { 45 | "stage": 1, 46 | "allgather_partitions": True, 47 | "allgather_bucket_size": 500000000, 48 | "overlap_comm": True, 49 | "reduce_scatter": True, 50 | "reduce_bucket_size": 500000000, 51 | "contiguous_gradients": True, 52 | }, 53 | 54 | # batch / data settings 55 | "train_micro_batch_size_per_gpu": 32, 56 | "data-impl": "mmap", 57 | 58 | # activation checkpointing 59 | "checkpoint-activations": true, 60 | "checkpoint-num-layers": 1, 61 | "partition-activations": true, 62 | "synchronize-each-layer": true, 63 | 64 | # regularization 65 | "gradient_clipping": 1.0, 66 | "weight-decay": 0.1, 67 | "hidden-dropout": 0.0, 68 | "attention-dropout": 0.0, 69 | 70 | # precision settings 71 | "fp16": { 72 | "enabled": true, 73 | "loss_scale": 0, 74 | "loss_scale_window": 1000, 75 | "hysteresis": 2, 76 | "min_loss_scale": 1 77 | }, 78 | 79 | # misc. 
training settings 80 | "train-iters": 50000, 81 | "lr-decay-iters": 50000, 82 | "distributed-backend": "nccl", 83 | "lr-decay-style": "cosine", 84 | "warmup": 0.01, 85 | "checkpoint-factor": 10000, 86 | "eval-interval": 5000, 87 | "eval-iters": 20, 88 | 89 | # logging 90 | "log-interval": 100, 91 | "steps_per_print": 10, 92 | "keep-last-n-checkpoints": 4, 93 | "wall_clock_breakdown": true, 94 | 95 | # networking 96 | "hostfile": "/mock_path", 97 | "save": "checkpoints", 98 | "load": "checkpoints", 99 | "tensorboard-dir": "tensorboard", 100 | "log-dir": "logs", 101 | } 102 | --------------------------------------------------------------------------------
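Note on the schedule fields shared by the configs above ("lr", "min_lr", "warmup", "lr_decay_iters"/"lr-decay-iters", "lr_decay_style": "cosine"): the sketch below shows how those numbers interact under the conventional linear-warmup-then-cosine-decay reading of them. The function name and the exact formula are assumptions for illustration only and are not taken from the repository's scheduler code.

    # Minimal sketch of a warmup + cosine-decay learning-rate schedule using the
    # field names that appear in these configs. Assumption: linear warmup over
    # warmup * decay_iters steps, then cosine decay from lr down to min_lr.
    # Illustrative only; not the repository's actual scheduler implementation.
    import math

    def lr_at_step(step, lr=6.0e-4, min_lr=6.0e-5, warmup=0.01, decay_iters=320000):
        warmup_iters = int(warmup * decay_iters)     # "warmup" read as a fraction of decay_iters (assumed)
        if step < warmup_iters:
            return lr * step / max(1, warmup_iters)  # linear ramp from 0 up to lr
        progress = min(1.0, (step - warmup_iters) / max(1, decay_iters - warmup_iters))
        return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    print(lr_at_step(0), lr_at_step(3200), lr_at_step(320000))  # ~0.0, ~6.0e-4, ~6.0e-5

With the 125M-style settings (lr 6.0e-4, min_lr 6.0e-5, warmup 0.01, 320000 decay iters), this reading gives 0 at step 0, the peak rate at step 3200, and min_lr at step 320000.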