├── tests
│   ├── __init__.py
│   ├── neox_args
│   │   ├── __init__.py
│   │   ├── test_neoxargs_implementation.py
│   │   └── test_neoxargs_usage.py
│   ├── pytest.ini
│   ├── model
│   │   └── __init__.py
│   └── README.md
├── tools
│   ├── kill.sh
│   ├── killall.sh
│   ├── sync_cmd.sh
│   ├── sync.sh
│   ├── syncdir.sh
│   ├── create_run_specific_config.py
│   ├── upload.py
│   └── create_eval_config.py
├── .dockerignore
├── CODEOWNERS
├── megatron
│   ├── data
│   │   ├── __init__.py
│   │   ├── Makefile
│   │   └── blendable_dataset.py
│   ├── gradient_noise_scale
│   │   └── __init__.py
│   ├── tokenizer
│   │   └── __init__.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── fused_bias_dropout.py
│   │   └── norms.py
│   ├── fused_kernels
│   │   ├── compat.h
│   │   ├── __init__.py
│   │   ├── setup.py
│   │   └── scaled_upper_triang_masked_softmax.cpp
│   ├── __init__.py
│   ├── mpu
│   │   ├── random.py
│   │   ├── __init__.py
│   │   └── utils.py
│   └── neox_arguments
│       ├── template.py
│       └── __init__.py
├── requirements
│   ├── requirements-sparseattention.txt
│   ├── requirements-flashattention.txt
│   ├── requirements-onebitadam.txt
│   ├── requirements-tensorboard.txt
│   ├── requirements-dev.txt
│   └── requirements.txt
├── MANIFEST.in
├── alon_configs
│   ├── gpu
│   │   ├── gpu0.yml
│   │   ├── gpu1.yml
│   │   ├── gpu2.yml
│   │   ├── gpu3.yml
│   │   ├── gpu4.yml
│   │   ├── gpu5.yml
│   │   ├── gpu6.yml
│   │   └── gpu7.yml
│   ├── run_specific
│   │   ├── 1B_doremi_1B.yml
│   │   ├── 1B_original.yml
│   │   ├── 160m_original.yml
│   │   ├── 1B_doremi_280.yml
│   │   ├── 410m_original.yml
│   │   ├── 160m_doremi_1B.yml
│   │   ├── 160m_doremi_280.yml
│   │   ├── 410m_doremi_1B.yml
│   │   └── 410m_doremi_280.yml
│   ├── parallelism.yml
│   ├── init.yml
│   ├── eval_tasks.yml
│   ├── data
│   │   └── openwebtext.yml
│   ├── train_data_weights
│   │   ├── doremi_120.yml
│   │   ├── doremi_1B.yml
│   │   ├── doremi_280.yml
│   │   ├── original_pile.yml
│   │   ├── static_1B_final.yml
│   │   └── static_1B_mean.yml
│   └── models
│       ├── eval_1B_1gpu.yml
│       ├── eval_1B_seqlen2048_1gpu.yml
│       ├── 1B_unnamed_train_datasets.yml
│       ├── 3B_unnamed_train_datasets.yml
│       ├── 1B_seqlen2048_unnamed_train_datasets.yml
│       ├── 410m.yml
│       ├── 1B.yml
│       ├── 1B_seqlen2048.yml
│       ├── 1B_150B_tokens.yml
│       ├── eval_3B_1gpu.yml
│       ├── eval_3B_seqlen2048_1gpu.yml
│       ├── 3B.yml
│       ├── eval_410m_1gpu.yml
│       ├── eval_1B_1gpu_local1.yml
│       ├── eval_1B_1gpu_local2.yml
│       ├── eval_1B_1gpu_local3.yml
│       ├── eval_1B_1gpu_local4.yml
│       ├── eval_1B_1gpu_local5.yml
│       ├── 160m.yml
│       └── eval_160m_1gpu.yml
├── configs
│   ├── cpu_mock_config.yml
│   ├── slurm_local.yml
│   ├── text_generation.yml
│   ├── sparse.yml
│   ├── eleutherai_cluster.yml
│   ├── local_setup.yml
│   ├── slurm_125M.yml
│   ├── 125M-json.yml
│   ├── gmlp_small.yml
│   ├── 19M.yml
│   ├── 800M.yml
│   ├── 49M.yml
│   ├── bnb_125M.yml
│   ├── bf16_125M.yml
│   ├── 175B.yml
│   ├── 350M.yml
│   ├── 1-3B.yml
│   ├── 2-7B.yml
│   ├── 6-7B.yml
│   ├── 13B.yml
│   ├── 760M.yml
│   ├── 125M.yml
│   ├── alon_config_small.yml
│   └── gen_docs.py
├── example_train_run.sh
├── scripts
│   ├── convert_to_hf.sh
│   ├── train_bigram.sh
│   ├── evaluate_multishot.sh
│   ├── train_doremi_1B.sh
│   ├── train_doremi_280.sh
│   ├── train_original_pile.sh
│   ├── train_410m_pile.sh
│   ├── train_160m_pile.sh
│   ├── train_160m_exp3.sh
│   ├── train_160m_naive_validation.sh
│   ├── train_1B_original_unnamed_train_datasets.sh
│   ├── train_1B_doremi_120_unnamed_train_datasets.sh
│   ├── train_1B_doremi_280_unnamed_train_datasets.sh
│   ├── train_1B_seqlen2048_static_1B_mean.sh
│   ├── train_1B_seqlen2048_static_1B_final.sh
│   ├── train_1B_seqlen2048_doremi_120_unnamed_train_datasets.sh
│   ├── train_1B_seqlen2048_original_unnamed_train_datasets.sh
│   ├── train_1B_exp3.sh
│   ├── train_1B_exp3_mixed_minibatches.sh
│   ├── train_1B_ema.sh
│   ├── train_1B_ema_0.5smoothing.sh
│   ├── train_3B_ema_0.5smoothing.sh
│   ├── train_1B_ema_0.5smoothing_150B_tokens.sh
│   ├── train_1B_seqlen2048_ema_0.5smoothing.sh
│   └── evaluate.sh
├── eval_tasks
│   └── __init__.py
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   └── workflows
│       ├── cpu_ci.yml
│       ├── pull_request.yml
│       └── docker_build.yml
├── alon_setup.sh
├── preprocess_pile.sh
├── create_debug_data.sh
├── train.py
├── .pre-commit-config.yaml
├── deepy.py
├── CITATION.cff
├── prepare_data.py
└── .gitignore
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 20B_checkpoints/ 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @EleutherAI/pm-gptneo 2 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==0.4.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==0.2.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111==8.6.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.5.0 2 | -------------------------------------------------------------------------------- /tools/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /alon_configs/gpu/gpu0.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:0", 3 | "world_size": 1, 4 | "master_port": 12340, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu1.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:1", 3 | "world_size": 1, 4 | "master_port": 12341, 5 | }
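# NOTE: gpu0.yml through gpu7.yml all follow this pattern: each pins a run to a single
# local device ("include": "localhost:N") and a distinct "master_port" (12340 + N),
# presumably so several single-GPU jobs can share one node without port collisions.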
-------------------------------------------------------------------------------- /alon_configs/gpu/gpu2.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:2", 3 | "world_size": 1, 4 | "master_port": 12342, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu3.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:3", 3 | "world_size": 1, 4 | "master_port": 12343, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu4.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:4", 3 | "world_size": 1, 4 | "master_port": 12344, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu5.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:5", 3 | "world_size": 1, 4 | "master_port": 12345, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu6.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:6", 3 | "world_size": 1, 4 | "master_port": 12346, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu7.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:7", 3 | "world_size": 1, 4 | "master_port": 12347, 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_doremi_1B", 3 | "wandb_group": "1B_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_original", 3 | "wandb_group": "1B_original" 4 | } -------------------------------------------------------------------------------- /alon_configs/parallelism.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_original", 3 | "wandb_group": "160m_original" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_doremi_280", 3 | "wandb_group": "1B_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_original", 3 | "wandb_group": "410m_original" 4 | } -------------------------------------------------------------------------------- /tests/neox_args/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /alon_configs/init.yml: -------------------------------------------------------------------------------- 1 | { 2 | # init methods 3 | "init_method": "small_init", 4 | "output_layer_init_method": "wang_init", 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_doremi_1B", 3 | "wandb_group": "160m_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_doremi_280", 3 | "wandb_group": "160m_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_doremi_1B", 3 | "wandb_group": "410m_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_doremi_280", 3 | "wandb_group": "410m_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/eval_tasks.yml: -------------------------------------------------------------------------------- 1 | { 2 | "eval_tasks": ["lambada_openai", "piqa", "winogrande", "arc_easy", "sciq", "wikitext", "openbookqa"], 3 | } -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.5.6 2 | clang-format==13.0.1 3 | pre-commit~=2.17.0 4 | pytest==6.2.3 5 | pytest-cov==2.11.1 6 | pytest-forked==1.3.0 7 | pytest-xdist 8 | transformers~=4.16.2 9 | -------------------------------------------------------------------------------- /configs/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 
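# (Presumably paired with the cpu-marked tests that .github/workflows/cpu_ci.yml runs
# via `pytest tests -m cpu`: hard-coding "global_num_gpus" keeps argument validation
# from probing for real devices.)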
3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /alon_configs/data/openwebtext.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train-data-paths": ["data/openwebtext-processed_text_document"], 3 | "valid-data-paths": ["data/openwebtext-processed_text_document"], 4 | "test-data-paths": ["data/openwebtext-processed_text_document"], 5 | } -------------------------------------------------------------------------------- /example_train_run.sh: -------------------------------------------------------------------------------- 1 | CONFIGS="alon_configs/data/pile_v2.yml alon_configs/models/160.yml alon_configs/init.yml alon_configs/optimizer.yml alon_configs/parallelism.yml" 2 | echo "Running with configs: ${CONFIGS}" 3 | 4 | RUN_NAME="current_run" 5 | python3 deepy.py train.py ${CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 6 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /configs/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data-path": "data/enron/enron_text_document", 3 | "vocab-file": "data/gpt2-vocab.json", 4 | "merge-file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard-dir": "tensorboard", 8 | "log-dir": "logs", 9 | "use_wandb": true, 10 | "wandb_host": "https://api.wandb.ai", 11 | "wandb_project": "neox" 12 | } 13 | -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_120.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1379, 4 | 0.0608, 5 | 0.0757, 6 | 0.1905, 7 | 0.0535, 8 | 0.0325, 9 | 0.038, 10 | 0.0746, 11 | 0.0327, 12 | 0.097, 13 | 0.0292, 14 | 0.0032, 15 | 0.1068, 16 | 0.0019, 17 | 0.0083, 18 | 0.0037, 19 | 0.012, 20 | 0.0084, 21 | 0.0117, 22 | 0.0093, 23 | 0.0084, 24 | 0.004 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1199, 4 | 0.0149, 5 | 0.0739, 6 | 0.3289, 7 | 0.0384, 8 | 0.0129, 9 | 0.0148, 10 | 0.0452, 11 | 0.026, 12 | 0.1461, 13 | 0.025, 14 | 0.0017, 15 | 0.0962, 16 | 0.0004, 17 | 0.0044, 18 | 0.0029, 19 | 0.0078, 20 | 0.0058, 21 | 0.0159, 22 | 0.0063, 23 | 0.0094, 24 | 0.0033 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.6057, 4 | 0.0046, 5 | 0.0224, 6 | 0.1019, 7 | 0.0036, 8 | 0.0179, 9 | 0.0043, 10 | 0.0153, 11 | 0.0036, 12 | 0.0113, 13 | 0.0072, 14 | 0.0047, 15 | 0.0699, 16 | 0.0018, 17 | 0.0093, 18 | 0.0061, 19 | 
0.0062, 20 | 0.0134, 21 | 0.0502, 22 | 0.0274, 23 | 0.0063, 24 | 0.007 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/original_pile.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1811, 4 | 0.144, 5 | 0.1207, 6 | 0.1001, 7 | 0.0896, 8 | 0.0759, 9 | 0.0612, 10 | 0.0513, 11 | 0.0365, 12 | 0.0307, 13 | 0.0217, 14 | 0.0155, 15 | 0.0153, 16 | 0.0124, 17 | 0.0088, 18 | 0.0075, 19 | 0.0073, 20 | 0.0062, 21 | 0.006, 22 | 0.0038, 23 | 0.003, 24 | 0.0014 25 | ] 26 | } -------------------------------------------------------------------------------- /scripts/convert_to_hf.sh: -------------------------------------------------------------------------------- 1 | # Example usage: 2 | # bash scripts/convert_to_hf.sh 3B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing_seed42 30000 alon_configs/models/3B.yml 3 | 4 | METHOD=$1 5 | STEP=$2 6 | MODEL_CONFIG=$3 7 | 8 | python3 tools/convert_sequential_to_hf.py \ 9 | --input_dir outputs/$METHOD/global_step${STEP} \ 10 | --config_file $MODEL_CONFIG \ 11 | --output_dir outputs/$METHOD/global_step${STEP}/hf_model -------------------------------------------------------------------------------- /alon_configs/train_data_weights/static_1B_final.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.0894, 4 | 0.0316, 5 | 0.0701, 6 | 0.0705, 7 | 0.0267, 8 | 0.0155, 9 | 0.0346, 10 | 0.0353, 11 | 0.0403, 12 | 0.0467, 13 | 0.0669, 14 | 0.0553, 15 | 0.0504, 16 | 0.0168, 17 | 0.0363, 18 | 0.07, 19 | 0.0315, 20 | 0.0604, 21 | 0.0373, 22 | 0.0451, 23 | 0.0466, 24 | 0.0228 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/static_1B_mean.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.0816, 4 | 0.0346, 5 | 0.0706, 6 | 0.0718, 7 | 0.0293, 8 | 0.0178, 9 | 0.0355, 10 | 0.0332, 11 | 0.0398, 12 | 0.0461, 13 | 0.0664, 14 | 0.0507, 15 | 0.0516, 16 | 0.0157, 17 | 0.0351, 18 | 0.0632, 19 | 0.0323, 20 | 0.0607, 21 | 0.0361, 22 | 0.0549, 23 | 0.0471, 24 | 0.0261 25 | ] 26 | } -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed 2 | einops==0.3.0 3 | ftfy==6.0.1 4 | git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 5 | huggingface_hub==0.11.0 6 | lm_eval==0.3.0 7 | numpy==1.22.0 8 | pybind11==2.6.2 9 | regex 10 | sentencepiece 11 | six 12 | tiktoken==0.1.2 13 | tokenizers==0.12.1 14 | transformers~=4.24.0 15 | wandb==0.10.28 16 | protobuf==3.20.* 17 | best-download 18 | urllib3~=1.26.16 19 | datasets==2.16.0 -------------------------------------------------------------------------------- /scripts/train_bigram.sh: -------------------------------------------------------------------------------- 1 | NUM_TRAIN_SAMPLES=10000000 2 | 3 | OUTPUT_DIR="outputs/bigram_model" 4 | mkdir -p $OUTPUT_DIR 5 | 6 | # get dataset names from path 7 | for f in /share/edc/home/alon_albalak/data/pile/test/*; do 8 | DATASET_NAME=$(basename $f) 9 | DATASET_NAME=${DATASET_NAME%.jsonl} 10 | echo $DATASET_NAME 11 | 12 | python3 
bigram_model.py \ 13 | --train \ 14 | --evaluate \ 15 | --dataset_name $DATASET_NAME \ 16 | --train_samples $NUM_TRAIN_SAMPLES \ 17 | > ${OUTPUT_DIR}/${DATASET_NAME}.log 2> ${OUTPUT_DIR}/${DATASET_NAME}.err 18 | 19 | done -------------------------------------------------------------------------------- /configs/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text-gen-type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num-samples": 10, 17 | 18 | # input/output file 19 | "sample-input-file": "sample_input.txt", 20 | "sample-output-file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /configs/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /alon_setup.sh: -------------------------------------------------------------------------------- 1 | conda activate base 2 | conda create --name gptneox python=3.8 3 | conda activate gptneox 4 | conda install pytorch pytorch-cuda=12.1 -c pytorch-nightly -c nvidia 5 | pip install -r requirements/requirements.txt 6 | pip install -r requirements/requirements-flashattention.txt 7 | python ./megatron/fused_kernels/setup.py install 8 | 9 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/utils.py 10 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/zero/stage2.py 11 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py 12 | mkdir outputs -------------------------------------------------------------------------------- /scripts/evaluate_multishot.sh: -------------------------------------------------------------------------------- 1 | RUN_NAME=$1 2 | MODEL_CONFIG_EVAL=$2 3 | STEP=$3 4 | 5 | # MODEL_CONFIG_EVAL is the file name in the alon_configs/models/ directory 6 | # e.g. 
eval_1B_1gpu (which the script expands to alon_configs/models/eval_1B_1gpu.yml) 7 | # OR eval_3B_seqlen2048_1gpu 8 | 9 | # evaluate 0-shot 10 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step${STEP}/configs/${RUN_NAME}.yml alon_configs/models/${MODEL_CONFIG_EVAL}.yml ${STEP} 0 2>&1 | tee outputs/${RUN_NAME}_${STEP}_eval.log & 11 | # evaluate 1-shot through 5-shot 12 | for i in {1..5}; do 13 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step${STEP}/configs/${RUN_NAME}.yml alon_configs/models/${MODEL_CONFIG_EVAL}.yml ${STEP} ${i} ${i} 2>&1 | tee outputs/${RUN_NAME}_${STEP}_${i}shot_eval.log & 14 | done 15 | wait < <(jobs -p) -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .test_model_instantiation import run_test_model_instantiation 16 | from .test_model_train import run_train_test 17 | from .test_model_checkpoint import run_checkpoint_test 18 | -------------------------------------------------------------------------------- /tools/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Proposed solution** 24 | If you have an idea for how we can fix this problem, describe it here. 25 | 26 | **Screenshots** 27 | If applicable, add screenshots to help explain your problem.
28 | 29 | **Environment (please complete the following information):** 30 | - GPUs: 31 | - Configs: 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /tools/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /.github/workflows/cpu_ci.yml: -------------------------------------------------------------------------------- 1 | name: "Run CPU Tests" 2 | 3 | on: "push" 4 | 5 | jobs: 6 | run-tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | 11 | - name: Install Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.8" 15 | 16 | - name: Upgrade Pip 17 | run: python -m pip install --upgrade pip 18 | 19 | - name: Install Dependencies 20 | run: | 21 | sudo apt-get install libopenmpi-dev -y 22 | pip install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu 23 | pip install -r requirements/requirements.txt 24 | pip install -r requirements/requirements-dev.txt 25 | 26 | - name: Prepare Data 27 | run: python prepare_data.py 28 | 29 | - name: Run CPU Tests 30 | run: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu 31 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import get_params_for_weight_decay_optimization 20 | from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /tools/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files or directories to all nodes 18 | # Usage 19 | # syncdir.sh path [path2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes.
*/ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /preprocess_pile.sh: -------------------------------------------------------------------------------- 1 | domains=$1 2 | 3 | for split in train validation test; 4 | do 5 | SPLIT_PATH="/share/edc/home/alon_albalak/data/pile/${split}/" 6 | for DATASET_NAME in ${domains}; 7 | do 8 | DATASET_PATH="${SPLIT_PATH}${DATASET_NAME}.jsonl" 9 | echo "path: dataset path: ${DATASET_PATH}" 10 | echo "name: dataset name: ${DATASET_NAME}" 11 | echo "outputting to: /share/edc/home/alon_albalak/data/pile/preprocessed/${DATASET_NAME}" 12 | 13 | OUTPUT_DIR=/share/edc/home/alon_albalak/data/pile/preprocessed/$split/${DATASET_NAME} 14 | mkdir -p ${OUTPUT_DIR} 15 | 16 | python tools/preprocess_data.py \ 17 | --input $DATASET_PATH \ 18 | --output-prefix ${OUTPUT_DIR}/${DATASET_NAME} \ 19 | --vocab-file /share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json \ 20 | --dataset-impl mmap \ 21 | --tokenizer-type HFTokenizer \ 22 | --workers 24 \ 23 | --append-eod 2>&1 | tee ${OUTPUT_DIR}.log 24 | done 25 | done -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /tools/create_run_specific_config.py: -------------------------------------------------------------------------------- 1 | # Simple utility to create a yml config file from command line arguments. 2 | # Handles int, float, bool, and string arguments. 
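# Example (hypothetical values; the flag names mirror scripts/train_160m_exp3.sh):
#   python3 tools/create_run_specific_config.py --save outputs/my_run --seed 42 --mixed_batches true
# writes alon_configs/run_specific/my_run.yml containing:
#   {"save": "outputs/my_run", "seed": 42, "mixed_batches": true}
# Values are coerced to int/float/bool where possible; everything else stays a string.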
3 | 4 | import sys 5 | import os 6 | import json 7 | 8 | RUN_SPECIFIC_CONFIG_PATH="alon_configs/run_specific" 9 | 10 | print(sys.argv[1:]) 11 | config = {} 12 | for k, v in zip(sys.argv[1::2], sys.argv[2::2]): 13 | k = k.replace("--", "") 14 | # first, handle numeric inputs 15 | try: 16 | f_v = float(v) 17 | if f_v.is_integer(): 18 | v = int(v) 19 | else: 20 | v = float(v) 21 | # if not numeric, then convert bools 22 | except ValueError: 23 | if v.lower() == "true": 24 | v = True 25 | elif v.lower() == "false": 26 | v = False 27 | # otherwise, it's a string and we do nothing 28 | config[k] = v 29 | 30 | print(config) 31 | 32 | assert("save" in config) 33 | save_path = os.path.join(RUN_SPECIFIC_CONFIG_PATH, config["save"].split("/")[-1] + ".yml") 34 | with open(save_path, "w") as f: 35 | json.dump(config, f, indent=2) 36 | 37 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .initialize import initialize_megatron 27 | from .neox_arguments import NeoXArgs 28 | -------------------------------------------------------------------------------- /scripts/train_doremi_1B.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/doremi_1B.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_doremi_1B.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_doremi_1B.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_doremi_1B.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_doremi_1B" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_doremi_1B" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_doremi_1B" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_doremi_280.sh:
-------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/doremi_280.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_doremi_280.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_doremi_280.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_doremi_280.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_doremi_280" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_doremi_280" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_doremi_280" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_original_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_original.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_original.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_original.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_original" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_original" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_original" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log 21 | -------------------------------------------------------------------------------- /create_debug_data.sh: -------------------------------------------------------------------------------- 1 | SHARE_DIR=/share/edc/home/alon_albalak/data/pile/debug 2 | mkdir -p ${SHARE_DIR} 3 | 4 | for split in train validation test; do 5 | mkdir -p ${SHARE_DIR}/$split 6 | mkdir -p ${SHARE_DIR}/preprocessed/$split 7 | for DATASET_FILE in $(ls /share/edc/home/alon_albalak/data/pile/$split); do 8 | DATASET_NAME=${DATASET_FILE::-6} 9 | echo ${DATASET_FILE} 10 | echo ${DATASET_NAME} 11 | head -n 200 /share/edc/home/alon_albalak/data/pile/$split/${DATASET_FILE} > ${SHARE_DIR}/$split/${DATASET_FILE} 12 | 13 | OUTPUT_DIR=${SHARE_DIR}/preprocessed/$split/${DATASET_NAME} 14 | mkdir -p ${OUTPUT_DIR} 15 | 16 | python tools/preprocess_data.py \ 17 | --input ${SHARE_DIR}/$split/${DATASET_FILE} \ 18 | --output-prefix ${OUTPUT_DIR}/${DATASET_NAME} \ 19 | --vocab-file /share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json \ 20 | --dataset-impl mmap \ 21 | 
--tokenizer-type HFTokenizer \ 22 | --append-eod 23 | # 2>&1 | tee /share/edc/home/alon_albalak/data/pile/preprocessed/${DATASET_NAME}.log 24 | done 25 | done -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | 15 | update-documentation: 16 | runs-on: ubuntu-20.04 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | ref: ${{ github.event.pull_request.head.ref}} 21 | - run: | 22 | rm megatron/__init__.py 23 | pip install shortuuid 24 | rm megatron/neox_arguments/__init__.py 25 | python configs/gen_docs.py 26 | git config user.name github-actions 27 | git config user.email github-actions@github.com 28 | git add configs/neox_arguments.md 29 | git commit -m "Update NeoXArgs docs automatically" 30 | git push 31 | run-tests: 32 | runs-on: self-hosted 33 | steps: 34 | - uses: actions/checkout@v2 35 | - name: prepare data 36 | run: python prepare_data.py 37 | - name: Run Tests 38 | run: pytest --forked tests 39 | -------------------------------------------------------------------------------- /scripts/train_410m_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/models/410m.yml alon_configs/init.yml alon_configs/optimizer.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | ORIGINAL_WEIGHT_CONFIGS="alon_configs/train_data_weights/original_pile.yml alon_configs/run_specific/410m_original.yml" 7 | DOREMI_280_CONFIGS="alon_configs/train_data_weights/doremi_280.yml alon_configs/run_specific/410m_doremi_280.yml" 8 | DOREMI_1B_CONFIGS="alon_configs/train_data_weights/doremi_1B.yml alon_configs/run_specific/410m_doremi_1B.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS}" 11 | RUN_NAME="410m_original" 12 | python3 deepy.py train.py ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${DOREMI_280_CONFIGS}" 15 | RUN_NAME="410m_doremi_280" 16 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_280_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${DOREMI_1B_CONFIGS}" 19 | RUN_NAME="410m_doremi_1B" 20 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_1B_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_160m_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/models/160m.yml alon_configs/init.yml alon_configs/optimizer.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | 6 | # RUN SPECIFIC CONFIGS 7 | ORIGINAL_WEIGHT_CONFIGS="alon_configs/train_data_weights/original_pile.yml alon_configs/run_specific/160m_original.yml" 8 | DOREMI_280_CONFIGS="alon_configs/train_data_weights/doremi_280.yml alon_configs/run_specific/160m_doremi_280.yml" 9 | 
DOREMI_1B_CONFIGS="alon_configs/train_data_weights/doremi_1B.yml alon_configs/run_specific/160m_doremi_1B.yml" 10 | 11 | echo "Running with configs: ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS}" 12 | RUN_NAME="160m_original" 13 | python3 deepy.py train.py ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 14 | 15 | echo "Running with configs: ${CONFIGS} ${DOREMI_280_CONFIGS}" 16 | RUN_NAME="160m_doremi_280" 17 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_280_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | echo "Running with configs: ${CONFIGS} ${DOREMI_1B_CONFIGS}" 20 | RUN_NAME="160m_doremi_1B" 21 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_1B_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /configs/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train-data-paths": ["/mnt/ssd-1/data/enron/enron_train_text_document"], 5 | "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_val_text_document"], 6 | "test-data-paths": ["/mnt/ssd-1/data/enron/enron_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split": "995,4,1", 18 | # "data_path": "/mnt/ssd-1/data/enron/enron_train_text_document", 19 | 20 | "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard-dir": "/mnt/ssd-1/tensorboard", 25 | "log-dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /scripts/train_160m_exp3.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/160m.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="160m_ods" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | 10 | SEEDS=(1234 42 100 222) 11 | 12 | # RUN SPECIFIC CONFIGS 13 | for SEED in ${SEEDS[@]}; do 14 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 15 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY}" 16 | python3 tools/create_run_specific_config.py ${ARGS} 17 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 18 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 19 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 20 | 21 | # evaluate 22 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml
alon_configs/models/eval_160m_1gpu.yml 23 | done -------------------------------------------------------------------------------- /configs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data-path": "data/enron/enron_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 7 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 8 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab-file": "data/gpt2-vocab.json", 19 | "merge-file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard-dir": "tensorboard", 26 | "log-dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
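# Per-rank training entry point. It is normally not invoked directly: the shell scripts
# under scripts/ launch it through the DeepSpeed wrapper, e.g.
#   python3 deepy.py train.py <config1.yml> <config2.yml> ...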
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | if __name__ == "__main__": 23 | neox_args = NeoXArgs.consume_neox_args() 24 | neox_args.configure_distributed_args() 25 | neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab 26 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 27 | pretrain(neox_args=neox_args) 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-json 7 | - id: check-symlinks 8 | - id: check-yaml 9 | - id: destroyed-symlinks 10 | - id: end-of-file-fixer 11 | exclude: docs/CNAME 12 | - id: fix-byte-order-marker 13 | - id: fix-encoding-pragma 14 | args: [--remove] 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: requirements-txt-fixer 18 | - id: trailing-whitespace 19 | - repo: https://gitlab.com/daverona/pre-commit-cpp 20 | rev: 0.8.0 21 | hooks: 22 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 23 | args: [] 24 | 25 | - repo: https://github.com/psf/black 26 | rev: 22.3.0 27 | hooks: 28 | - id: black 29 | language_version: python3 30 | - repo: https://github.com/codespell-project/codespell 31 | rev: v2.1.0 32 | hooks: 33 | - id: codespell 34 | args: [ 35 | '--ignore-words-list=reord,dout', # Word used in error messages that need rewording 36 | --check-filenames, 37 | --check-hidden, 38 | ] 39 | -------------------------------------------------------------------------------- /deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed 20 | from deepspeed.launcher.runner import main 21 | 22 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | from megatron.neox_arguments import NeoXArgs 25 | from megatron.utils import get_wandb_api_key 26 | 27 | 28 | neox_args = NeoXArgs.consume_deepy_args() 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | if __name__ == "__main__": 38 | main(deepspeed_main_args) 39 | -------------------------------------------------------------------------------- /.github/workflows/docker_build.yml: -------------------------------------------------------------------------------- 1 | name: docker_build 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | main: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - 13 | name: Checkout 14 | uses: actions/checkout@v2 15 | 16 | - 17 | name: Docker meta 18 | id: docker_meta 19 | uses: crazy-max/ghaction-docker-meta@v1 20 | with: 21 | images: leogao2/gpt-neox # list of Docker images to use as base name for tags 22 | tag-sha: true # add git short SHA as Docker tag 23 | 24 | - 25 | name: Set up QEMU 26 | uses: docker/setup-qemu-action@v1 27 | 28 | - 29 | name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v1 31 | 32 | - 33 | name: Login to DockerHub 34 | uses: docker/login-action@v1 35 | with: 36 | username: ${{ secrets.DOCKERHUB_USERNAME }} 37 | password: ${{ secrets.DOCKERHUB_TOKEN }} 38 | 39 | - 40 | name: Build and push 41 | id: docker_build 42 | uses: docker/build-push-action@v2 43 | with: 44 | push: ${{ github.event_name != 'pull_request' }} 45 | tags: ${{ steps.docker_meta.outputs.tags }} 46 | labels: ${{ steps.docker_meta.outputs.labels }} 47 | 48 | - 49 | name: Image digest 50 | run: echo ${{ steps.docker_build.outputs.digest }} 51 | -------------------------------------------------------------------------------- /scripts/train_160m_naive_validation.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/160m.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="160m_ods_naive_validation_10" 6 | DATA_SAMPLING_METHOD="naive_validation" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="10" 9 | MIXED_BATCHES=true 10 | VALIDATION_BASED_REWARD=true 11 | 12 | SEEDS=(1234 42 100 222) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_batches ${MIXED_BATCHES} --validation_based_reward ${VALIDATION_BASED_REWARD}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee 
outputs/${RUN_NAME}.log 22 | 23 | # evaluate 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_160m_1gpu.yml 25 | done -------------------------------------------------------------------------------- /scripts/train_1B_original_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_original_unnamed_train_datasets" 6 | 7 | SEEDS=(1234 42 100 222) 8 | 9 | # RUN SPECIFIC CONFIGS 10 | for SEED in ${SEEDS[@]}; do 11 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 12 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 13 | python3 tools/create_run_specific_config.py ${ARGS} 14 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 15 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 16 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | # evaluate 0-shot 19 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 20 | # evaluate 1-shot through 5-shot 21 | for i in {1..5}; do 22 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 23 | done 24 | wait < <(jobs -p) 25 | done -------------------------------------------------------------------------------- /scripts/train_1B_doremi_120_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_120.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_doremi_120_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # evaluate 0-shot 20 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # evaluate 1-shot through 5-shot 22 | for i in {1..5}; do 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | done 25 | wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- 
/scripts/train_1B_doremi_280_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_280.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_doremi_280_unnamed_train_datasets" 6 | 7 | SEEDS=(1234 42 100 222) 8 | # SEEDS=( 1234 222 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # evaluate 0-shot 20 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # evaluate 1-shot through 5-shot 22 | for i in {1..5}; do 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | done 25 | wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_static_1B_mean.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/static_1B_mean.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_static_1B_mean" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_static_1B_final.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml 
alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/static_1B_final.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_static_1B_final" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_doremi_120_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_120.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_doremi_120_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_original_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | 
WANDB_GROUP="1B_seqlen2048_original_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_exp3.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | 10 | # SEEDS=(1234 42 100 222) 11 | SEEDS=( 1234 ) 12 | 13 | # RUN SPECIFIC CONFIGS 14 | for SEED in ${SEEDS[@]}; do 15 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 16 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY}" 17 | python3 tools/create_run_specific_config.py ${ARGS} 18 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 19 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 20 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 21 | 22 | # evaluate 0-shot 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 24 | # evaluate 5-shot 25 | NUM_SHOTS=5 26 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 27 | done -------------------------------------------------------------------------------- /megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # We have mostly moved to DeepSpeed's checkpointing (the code is identical anyway), so this file currently contains only imports 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether to apply model parallelism to checkpointed hidden states. 27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object. 30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | 3 | Tests use pytest with the coverage and forked plugins. Install with: 4 | 5 | ```bash 6 | pip install -r requirements/requirements-dev.txt 7 | ``` 8 | 9 | Download the required test data: 10 | ```bash 11 | python prepare_data.py 12 | ``` 13 | 14 | # Run 15 | 16 | Tests can be run using pytest. 17 | 18 | * The argument --forked needs to be provided 19 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation) 20 | * A subset of tests can be selected by pointing to the module within tests 21 | 22 | ```bash 23 | # run all tests, output coverage report of megatron module in terminal 24 | pytest --forked --cov-report term --cov=megatron tests 25 | 26 | # run tests in tests/model, output coverage report of megatron module as html 27 | pytest --forked --cov-report html --cov=megatron tests/model 28 | 29 | # run tests in tests/model/test_model_generation.py, don't output coverage report 30 | pytest --forked tests/model/test_model_generation.py 31 | ``` 32 | 33 | Some tests can run on CPU only. These are marked with the decorator @pytest.mark.cpu. 34 | The test cases for CPU can be run with: 35 | ```bash 36 | pytest tests -m cpu 37 | ``` 38 | 39 | If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files. 40 | 41 | ```bash 42 | python -m http.server --directory htmlcov 8000 43 | ``` 44 | 45 | 46 | ## Tips and Tricks 47 | If you see this kind of error: 48 | ``` 49 | RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method 50 | ``` 51 | It means that some torch.cuda function was called before the test framework forked its worker processes.
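A minimal sketch of the fix (hypothetical test file; the point is to touch CUDA only inside the test body, after pytest-forked has created the child process):

```python
import pytest
import torch

# BAD: a module-level CUDA call like the one below runs in the parent process
# during collection, so every forked child would later fail with the
# "Cannot re-initialize CUDA in forked subprocess" error.
# NUM_GPUS = torch.cuda.device_count()


@pytest.mark.forked
def test_cuda_tensor():
    # GOOD: CUDA is first initialized here, inside the forked child process.
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    x = torch.ones(2, 2, device="cuda")
    assert x.sum().item() == 4.0
```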
52 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | import subprocess 18 | 19 | from torch.utils import cpp_extension 20 | from pathlib import Path 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | 24 | # Setting this param to a list has a problem of generating different 25 | # compilation commands (with different order of architectures) and 26 | # leading to recompilation of fused kernels. Set it to empty string 27 | # to avoid recompilation and assign arch flags explicitly in 28 | # extra_cuda_cflags below 29 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 30 | 31 | 32 | def load_fused_kernels(): 33 | try: 34 | import scaled_upper_triang_masked_softmax_cuda 35 | import scaled_masked_softmax_cuda 36 | except (ImportError, ModuleNotFoundError): 37 | print("\n") 38 | print("=" * 100) 39 | print( 40 | f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them' 41 | ) 42 | print("=" * 100) 43 | exit() 44 | return 45 | -------------------------------------------------------------------------------- /scripts/train_1B_exp3_mixed_minibatches.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_mixed_minibatches_original_weights_init" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | 11 | # SEEDS=(1234 42 100 222) 12 | SEEDS=(42) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 22 | 23 | # evaluate 0-shot 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 25 | # evaluate 5-shot 26 | 
NUM_SHOTS=5 27 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 28 | done -------------------------------------------------------------------------------- /scripts/train_1B_ema.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | 11 | # SEEDS=(1234 42 100 222) 12 | SEEDS=(42) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 22 | 23 | # evaluate 0-shot 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 25 | # evaluate 5-shot 26 | NUM_SHOTS=5 27 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 28 | done -------------------------------------------------------------------------------- /tools/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | authors: 4 | - affiliation: EleutherAI 5 | family-names: Andonian 6 | given-names: Alex 7 | - affiliation: EleutherAI 8 | family-names: Biderman 9 | given-names: Stella 10 | - affiliation: EleutherAI 11 | family-names: Black 12 | given-names: Sid 13 | - affiliation: EleutherAI 14 | family-names: Gali 15 | given-names: Preetham 16 | - affiliation: EleutherAI 17 | family-names: Gao 18 | given-names: Leo 19 | - affiliation: EleutherAI 20 | family-names: Hallahan 21 | given-names: Eric 22 | - affiliation: EleutherAI 23 | family-names: Levy-Kramer 24 | given-names: Josh 25 | - affiliation: EleutherAI 26 | family-names: Leahy 27 | given-names: Connor 28 | - affiliation: EleutherAI 29 | family-names: Nestler 30 | given-names: Lucas 31 | - affiliation: EleutherAI 32 | family-names: Parker 33 | given-names: Kip 34 | - affiliation: EleutherAI 35 | family-names: Pieler 36 | given-names: Michael 37 | - affiliation: EleutherAI 38 | family-names: Purohit 39 | given-names: Shivanshu 40 | - affiliation: EleutherAI 41 | family-names: Songz 42 | given-names: Tri 43 | - affiliation: EleutherAI 44 | family-names: Wang 45 | given-names: Phil 46 | - affiliation: EleutherAI 47 | family-names: Weinbach 48 | given-names: Samuel 49 | cff-version: "1.1.0" 50 | keywords: 51 | - "Transformers" 52 | - "Massive language model" 53 | - "Autoregressive language model" 54 | license: "Apache-2.0" 55 | message: "If you use this software, please cite it using these metadata." 56 | repository-code: "https://www.github.com/eleutherai/gpt-neox" 57 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch" 58 | version: "0.0.1" 59 | doi: "10.5281/zenodo.5879544" 60 | date-released: 2021-08-23 61 | ... 62 | -------------------------------------------------------------------------------- /tools/create_eval_config.py: -------------------------------------------------------------------------------- 1 | # Simple utility that takes an existing yml config file and adds evaluation-specific parameters.
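# Usage sketch (invocation as in scripts/evaluate.sh):
#   python3 tools/create_eval_config.py --config_path ${MODEL_CONFIG} \
#       --num_fewshot ${NUM_FEWSHOT} --iteration ${STEP}
# With --num_fewshot 0 this writes <config>_eval.yml next to the input config;
# with N > 0 it writes <config>_eval_Nshot.yml (see the save_path handling below).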
2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | def get_step_from_path(path): 8 | top_dir = path.split("/")[-1] 9 | if "global_step" in top_dir: 10 | return int(top_dir.split("global_step")[-1]) 11 | else: 12 | return get_step_from_path("/".join(path.split("/")[:-1])) 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--config_path", type=str, required=True, help="Path to the config file to modify.") 16 | parser.add_argument("--num_fewshot", type=int, required=False, default=0, help="Flag for the number of few-shot in-context examples to use; 0 if none.") 17 | parser.add_argument("--iteration", type=int, required=False, default=None, help="Iteration of the model to evaluate. If not specified, will use the latest checkpoint.") 18 | 19 | args = parser.parse_args() 20 | model_step = get_step_from_path(args.config_path) 21 | 22 | with open(args.config_path, "r") as f: 23 | config = json.load(f) 24 | 25 | if "seed" not in config.keys(): 26 | config["seed"] = 1234 27 | 28 | config["wandb_run_name"] = f"seed{config['seed']}_eval" 29 | 30 | eval_results_prefix = os.path.join(config["save"], f"step{model_step}") 31 | config['load'] = config['save'] 32 | config['eval_results_prefix'] = eval_results_prefix 33 | 34 | save_path = args.config_path.replace(".yml", "_eval.yml") 35 | 36 | # if using iteration 37 | if args.iteration is not None: 38 | config["iteration"] = args.iteration 39 | 40 | # if using num_fewshot 41 | if args.num_fewshot > 0: 42 | config["eval_num_fewshot"] = args.num_fewshot 43 | config["wandb_run_name"] += f"_{args.num_fewshot}shot" 44 | save_path = save_path.replace(".yml", f"_{args.num_fewshot}shot.yml") 45 | 46 | with open(save_path, "w") as f: 47 | json.dump(config, f, indent=2) -------------------------------------------------------------------------------- /megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | Generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | Updates a property value if the key already exists. 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error.
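Example (behavior as implemented below): update_value("seed", 42) sets
self.seed when a "seed" field is already defined on the dataclass, and
raises ValueError (after logging the error) for an unknown key.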
33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value() to be updated property " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exists 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /configs/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true 30 | }, 31 | "train_micro_batch_size_per_gpu": 4, 32 | "data-impl": "mmap", 33 | "split": "949,50,1", 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | "gradient_clipping": 1.0, 39 | "weight-decay": 0.0, 40 | "hidden-dropout": 0.0, 41 | "attention-dropout": 0.0, 42 | "fp16": { 43 | "enabled": true, 44 | "loss_scale": 0, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "checkpoint-factor": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | "log-interval": 100, 58 | "steps_per_print": 10, 59 | "keep-last-n-checkpoints": 4, 60 | "wall_clock_breakdown": true, 61 | "launcher": "slurm", 62 | "deepspeed_slurm": true, 63 | "comment": "neox" 64 | } 65 | -------------------------------------------------------------------------------- /scripts/train_1B_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency 
${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # evaluate 0-shot 25 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # evaluate 1-shot through 5-shot 27 | for i in {1..5}; do 28 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | done 30 | wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_3B_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/3B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="3B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_1B_ema_0.5smoothing_150B_tokens.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_150B_tokens.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | 
WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing_150B_tokens" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh 
outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /configs/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00006, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | 42 | "train_micro_batch_size_per_gpu": 4, 43 | "data-impl": "mmap", 44 | 45 | "checkpoint-activations": true, 46 | "checkpoint-num-layers": 1, 47 | "partition-activations": true, 48 | "synchronize-each-layer": true, 49 | 50 | "gradient_clipping": 1.0, 51 | "weight-decay": 0.1, 52 | "hidden-dropout": 0.0, 53 | "attention-dropout": 0.0, 54 | 55 | "fp16": { 56 | "enabled": true, 57 | "loss_scale": 0, 58 | "loss_scale_window": 1000, 59 | "hysteresis": 2, 60 | "min_loss_scale": 1 61 | }, 62 | 63 | "train-iters": 320000, 64 | "lr-decay-iters": 320000, 65 | "distributed-backend": "nccl", 66 | "lr-decay-style": "cosine", 67 | "warmup": 0.01, 68 | "checkpoint-factor": 10000, 69 | "eval-interval": 1000, 70 | "eval-iters": 10, 71 | 72 | "log-interval": 100, 73 | "steps_per_print": 10, 74 | "keep-last-n-checkpoints": 4, 75 | "wall_clock_breakdown": true, 76 | 77 | "hostfile": "/mock_path" 78 | } 79 | -------------------------------------------------------------------------------- /scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | MODEL_CONFIG=$1 2 | EVAL_CONFIG=$2 3 | STEP=$3 4 | NUM_FEWSHOT=${4:-0} 5 | GPU=${5:-0} 6 | 7 | # Can't use perplexity-based evaluation tasks with in-context examples 8 | if [ ${NUM_FEWSHOT} -eq 0 ]; then 9 | EVAL_TASKS="lambada_openai piqa winogrande wsc arc_easy sciq logiqa wikitext openbookqa hendrycksTest-*" 10 | else 11 | # if using few-shot, then we can't use wikitext 12 | EVAL_TASKS="lambada_openai piqa winogrande wsc arc_easy sciq logiqa openbookqa hendrycksTest-*" 13 | fi 14 | # Temporarily not using triviaqa because it can't download? 15 | # Not using: 16 | # webqs (web questions) because our models have very poor performance (0.005 accuracy) 17 | # squad2 because it leads to: AttributeError: 'SequentialWrapper' object has no attribute 'clear_cache' 18 | 19 | # MODEL_CONFIG should be from the output of the training script 20 | # For example, gpt-neox/outputs/160m_doremi_280_seed42/global_step100000/configs/160m_doremi_280_seed42.yml 21 | # It should have additional fields. 
E.g.: 22 | # "load": "outputs/160m_doremi_280_seed42", 23 | # "eval_results_prefix": "outputs/160m_doremi_280_seed42", 24 | # CONFIGS="outputs/160m_doremi_280_seed42/global_step100000/configs/160m_doremi_280_seed42.yml alon_configs/models/eval_160m_1gpu.yml" 25 | python3 tools/create_eval_config.py --config_path ${MODEL_CONFIG} --num_fewshot ${NUM_FEWSHOT} --iteration ${STEP} 26 | 27 | # if not using num_fewshot, then you can just use the following: 28 | if [ ${NUM_FEWSHOT} -eq 0 ]; then 29 | # EVAL_MODEL_CONFIG is MODEL_CONFIG with .yml replaced by _eval.yml 30 | EVAL_MODEL_CONFIG=${MODEL_CONFIG%.yml}_eval.yml 31 | else 32 | EVAL_MODEL_CONFIG=${MODEL_CONFIG%.yml}_eval_${NUM_FEWSHOT}shot.yml 33 | fi 34 | 35 | # Get GPU Config 36 | GPU_CONFIG=alon_configs/gpu/gpu${GPU}.yml 37 | 38 | 39 | # EVAL_CONFIG should be in the configs folder. See alon_configs/models/eval_160m_1gpu.yml for an example 40 | python ./deepy.py evaluate.py ${EVAL_MODEL_CONFIG} ${EVAL_CONFIG} ${GPU_CONFIG} --eval_tasks ${EVAL_TASKS} -------------------------------------------------------------------------------- /configs/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num-layers": 12, 12 | "hidden-size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num-attention-heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 15 | "seq-length": 2048, 16 | "max-position-embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos-emb": "none", 19 | "no-weight-tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e-8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data-impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint-activations": true, 38 | "checkpoint-num-layers": 1, 39 | "partition-activations": false, 40 | "synchronize-each-layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight-decay": 0.1, 45 | "hidden-dropout": 0.0, 46 | "attention-dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. training settings 58 | "train-iters": 320000, 59 | "lr-decay-iters": 320000, 60 | "distributed-backend": "nccl", 61 | "lr-decay-style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint-factor": 10000, 64 | "eval-interval": 1000, 65 | "eval-iters": 10, 66 | 67 | # logging 68 | "log-interval": 100, 69 | "steps_per_print": 10, 70 | "keep-last-n-checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /configs/19M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | # model settings 6 | "num-layers": 6, 7 | "hidden-size": 512, 8 | "num-attention-heads": 8, 9 | "seq-length": 2048, 10 | "max-position-embeddings": 2048, 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt-j-residual": false, 14 | "output-layer-parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.001, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.0001, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 4, #32, 45 | "gas": 1, 46 | "data-impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 
57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train-iters": 143000, 73 | "lr-decay-iters": 143000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 1000, 78 | "eval-interval": 100000, 79 | "eval-iters": 10, 80 | 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | # model settings 6 | "num-layers": 16, 7 | "hidden-size": 2048, 8 | "num-attention-heads": 8, 9 | "seq-length": 2048, 10 | "max-position-embeddings": 2048, 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt-j-residual": false, 14 | "output-layer-parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00025, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.000025, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 16, 45 | "gas": 1, 46 | "data-impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train-iters": 143000, 73 | "lr-decay-iters": 143000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 1000, 78 | "eval-interval": 40000, 79 | "eval-iters": 10, 80 | 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from tools.corpora import prepare_dataset, DATA_DOWNLOADERS 16 | import argparse 17 | 18 | TOKENIZER_CHOICES = [ 19 | "HFGPT2Tokenizer", 20 | "HFTokenizer", 21 | "GPT2BPETokenizer", 22 | "CharLevelTokenizer", 23 | "TiktokenTokenizer", 24 | ] 25 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"] 26 | 27 | 28 | def get_args(): 29 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets") 30 | parser.add_argument( 31 | "dataset", 32 | nargs="?", 33 | default="enron", 34 | help="name of dataset to download.", 35 | choices=DATASET_CHOICES, 36 | ) 37 | parser.add_argument( 38 | "-t", 39 | "--tokenizer", 40 | default="GPT2BPETokenizer", 41 | choices=TOKENIZER_CHOICES, 42 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}', 43 | ) 44 | parser.add_argument( 45 | "-d", 46 | "--data-dir", 47 | default=None, 48 | help=f"Directory to which to download datasets / tokenizer " 49 | f"files - defaults to ./data", 50 | ) 51 | parser.add_argument( 52 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)" 53 | ) 54 | parser.add_argument( 55 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)" 56 | ) 57 | return parser.parse_args() 58 | 59 | 60 | if __name__ == "__main__": 61 | args = get_args() 62 | prepare_dataset( 63 | dataset_name=args.dataset, 64 | tokenizer_type=args.tokenizer, 65 | data_dir=args.data_dir, 66 | vocab_file=args.vocab_file, 67 | merge_file=args.merge_file, 68 | ) 69 | -------------------------------------------------------------------------------- /megatron/fused_kernels/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def _get_cuda_bare_metal_version(cuda_dir): 9 | raw_output = subprocess.check_output( 10 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 11 | ) 12 | output = raw_output.split() 13 | release_idx = output.index("release") + 1 14 | release = output[release_idx].split(".") 15 | bare_metal_major = release[0] 16 | bare_metal_minor = release[1][0] 17 | 18 | return raw_output, bare_metal_major, bare_metal_minor 19 | 20 | 21 | srcpath = Path(__file__).parent.absolute() 22 | cc_flag = [] 23 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append("-gencode") 26 | cc_flag.append("arch=compute_80,code=sm_80") 27 | 28 | nvcc_flags = [ 29 | "-O3", 30 | "-gencode", 31 | "arch=compute_70,code=sm_70", 32 | "--use_fast_math", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "--expt-relaxed-constexpr", 36 | "--expt-extended-lambda", 37 | ] 38 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} 39 | layernorm_cuda_args = { 40 | "cxx": ["-O3"], 41 | "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], 42 | } 43 | setup( 44 | 
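    # (sketch) two CUDA extensions are declared below, one per fused softmax
    # kernel; building them needs a CUDA toolkit whose nvcc matches the flags
    # above, e.g. `python megatron/fused_kernels/setup.py install`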
name="fused_kernels", 45 | version="0.0.1", 46 | author="Sid Black & Alejandro Molina et al.", 47 | author_email="alejandro.molina@aleph-alpha.de", 48 | include_package_data=False, 49 | ext_modules=[ 50 | CUDAExtension( 51 | "scaled_upper_triang_masked_softmax_cuda", 52 | [ 53 | str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), 54 | str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), 55 | ], 56 | extra_compile_args=cuda_ext_args, 57 | ), 58 | CUDAExtension( 59 | "scaled_masked_softmax_cuda", 60 | [ 61 | str(srcpath / "scaled_masked_softmax.cpp"), 62 | str(srcpath / "scaled_masked_softmax_cuda.cu"), 63 | ], 64 | extra_compile_args=cuda_ext_args, 65 | ), 66 | ], 67 | cmdclass={"build_ext": BuildExtension}, 68 | ) 69 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_io_parallel_group 35 | from .initialize import initialize_model_parallel 36 | from .initialize import model_parallel_is_initialized 37 | 38 | from .layers import ColumnParallelLinear 39 | from .layers import RowParallelLinear 40 | from .layers import VocabParallelEmbedding 41 | from .layers import ParallelRelativePositionBias 42 | 43 | from .mappings import copy_to_model_parallel_region 44 | from .mappings import gather_from_model_parallel_region 45 | from .mappings import reduce_from_model_parallel_region 46 | from .mappings import scatter_to_model_parallel_region 47 | 48 | from .random import checkpoint 49 | from .random import get_cuda_rng_tracker 50 | from .random import model_parallel_cuda_manual_seed 51 | 52 | from .utils import divide 53 | from .utils import split_tensor_along_last_dim 54 | -------------------------------------------------------------------------------- /configs/49M.yml: -------------------------------------------------------------------------------- 1 | { 
2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 10, 8 | "hidden-size": 640, 9 | "num-attention-heads": 10, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "wang_init", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0008, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00008, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 32, 50 | "gas": 1, 51 | "data-impl": "mmap", 52 | "num_workers": 1, 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "initial_scale_power": 12, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1, 75 | }, 76 | 77 | # misc. 
training settings 78 | "train-iters": 143000, 79 | "lr-decay-iters": 143000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 1000, 84 | "eval-interval": 100000, 85 | "eval-iters": 10, 86 | 87 | # logging 88 | "log-interval": 10, 89 | "steps_per_print": 10, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/bnb_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "use-bnb-optimizer": true, 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled-upper-triang-masked-softmax-fusion": false, 21 | "bias-gelu-fusion": false, 22 | 23 | 24 | # optimizer settings 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.0006, 29 | "betas": [0.9, 0.999], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 0, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | # batch / data settings 45 | "train_micro_batch_size_per_gpu": 4, 46 | "data-impl": "mmap", 47 | "split": "949,50,1", 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.0, 58 | "hidden-dropout": 0.0, 59 | "attention-dropout": 0.0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_seqlen2048_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | 
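  # attention-config pairs an attention type with a layer count; the entry
  # below applies the "flash" (FlashAttention) kernel to all 16 layers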
"attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | 
"synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_eval_datasets": true, 94 | "max_validation_samples_per_dataset": 5000, 95 | 96 | # "keep_last_n_checkpoints": 1 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/3B_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 6, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": 
"/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 240, 20 | "train_micro_batch_size_per_gpu": 30, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_eval_datasets": true, 94 | "max_validation_samples_per_dataset": 5000, 95 | 96 | # "keep_last_n_checkpoints": 1 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/410m.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 24, 4 | "hidden-size": 1024, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 24]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | 
"bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0003, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.00003, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | 
"weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_seqlen2048.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 240, 20 | "train_micro_batch_size_per_gpu": 30, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain 
specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_150B_tokens.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 300000, 72 | "lr-decay-iters": 300000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_3B_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": 
true, 19 | 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_3B_seqlen2048_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | 
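  # (dropout is 0 here as in the other eval configs; regularization comes
  # from gradient clipping and weight decay alone)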
"gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | } -------------------------------------------------------------------------------- /configs/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 33 | "zero_optimization": { 34 | "stage": 0, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "type": "bfloat16", # set bf16 as precision 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 71 | # misc. 
training settings 72 | "train-iters": 320000, 73 | "lr-decay-iters": 320000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | "eval-iters": 10, 80 | 81 | # logging 82 | "log-interval": 100, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 4, 85 | "wall_clock_breakdown": true, 86 | } 87 | -------------------------------------------------------------------------------- /configs/175B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 96, 10 | "hidden-size": 12288, 11 | "num-attention-heads": 96, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00006, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000006, 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | 49 | # batch / data settings 50 | "train_micro_batch_size_per_gpu": 4, 51 | "data-impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "hysteresis": 2, 72 | "min_loss_scale": 1 73 | }, 74 | 75 | # misc. 
training settings 76 | "train-iters": 320000, 77 | "lr-decay-iters": 320000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 10000, 82 | "eval-interval": 1000, 83 | "eval-iters": 10, 84 | 85 | # logging 86 | "log-interval": 100, 87 | "steps_per_print": 10, 88 | "keep-last-n-checkpoints": 4, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /configs/350M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1024, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0003, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00003, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | # batch / data settings 50 | "train_micro_batch_size_per_gpu": 4, 51 | "data-impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "hysteresis": 2, 72 | "min_loss_scale": 1 73 | }, 74 | 75 | # misc. 
training settings 76 | "train-iters": 320000, 77 | "lr-decay-iters": 320000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 10000, 82 | "eval-interval": 1000, 83 | "eval-iters": 10, 84 | 85 | # logging 86 | "log-interval": 100, 87 | "steps_per_print": 10, 88 | "keep-last-n-checkpoints": 4, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /configs/1-3B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 2048, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0002, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00002, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/2-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 2560, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00016, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000016, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/6-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 4096, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00012, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | "min_lr": 0.000012, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/13B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 40, 10 | "hidden-size": 5120, 11 | "num-attention-heads": 40, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0001, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | "min_lr": 0.00001, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0, 65 | "attention-dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train-iters": 320000, 79 | "lr-decay-iters": 320000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 10000, 84 | "eval-interval": 1000, 85 | "eval-iters": 10, 86 | 87 | # logging 88 | "log-interval": 100, 89 | "steps_per_print": 10, 90 | "keep-last-n-checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/760M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1536, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00025, 33 | "betas": [0.9, 0.999], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000025, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0.0, 65 | "attention-dropout": 0.0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | 92 | # networking 93 | "hostfile": "/mock_path" 94 | } 95 | -------------------------------------------------------------------------------- /alon_configs/models/3B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 6, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | # "keep_last_n_checkpoints": 1 99 | 100 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_410m_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 24, 4 | "hidden-size": 1024, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | 
"rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 24]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0003, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.00003, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:0", 100 | "world_size": 1, 101 | "master_port": 12344, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local1.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | 
"reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:1", 100 | "world_size": 1, 101 | "master_port": 12341, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local2.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | 
"warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:2", 100 | "world_size": 1, 101 | "master_port": 12342, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local3.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:3", 100 | "world_size": 1, 101 | "master_port": 12343, 102 | 103 | } -------------------------------------------------------------------------------- 
/alon_configs/models/eval_1B_1gpu_local4.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:4", 100 | "world_size": 1, 101 | "master_port": 12344, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local5.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | 
"type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:5", 100 | "world_size": 1, 101 | "master_port": 12345, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/160m.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 162,322,944 parameters 3 | "num-layers": 12, 4 | "hidden-size": 768, 5 | "num-attention-heads": 12, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 12]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 60, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.00006, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | 
"attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | "keep_last_n_checkpoints": 1 99 | 100 | # "include": "localhost:0,1", 101 | # "world_size": 2, 102 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_160m_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 162,322,944 parameters 3 | "num-layers": 12, 4 | "hidden-size": 768, 5 | "num-attention-heads": 12, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 12]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.00006, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | 
# wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | "keep_last_n_checkpoints": 1, 99 | 100 | "include": "localhost:0", 101 | "world_size": 1, 102 | "master_port": 12344, 103 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb logs 132 | wandb/ 133 | 134 | # data files 135 | data/**/*.idx 136 | data/**/*.bin 137 | data/**/*.json* 138 | data/**/*.txt 139 | data/**/*.gz 140 | data/**/*.np* 141 | data/**/*.npy 142 | checkpoints/ 143 | .vscode/ 144 | *.pt 145 | *.ckpt 146 | outputs/* 147 | 148 | #test logs 149 | test_checkpoint/ 150 | test_logs/ 151 | logs/ 152 | tensorboard/ 153 | src/ 154 | 155 | # configs 156 | alon_configs/run_specific/* -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from megatron import print_rank_0 26 | from megatron import mpu 27 | 28 | 29 | class BlendableDataset(torch.utils.data.Dataset): 30 | def __init__(self, datasets, weights): 31 | self.datasets = datasets 32 | num_datasets = len(datasets) 33 | assert num_datasets == len(weights) 34 | 35 | self.size = 0 36 | for dataset in self.datasets: 37 | self.size += len(dataset) 38 | 39 | # Normalize weights. 40 | weights = np.array(weights, dtype=np.float64) 41 | sum_weights = np.sum(weights) 42 | assert sum_weights > 0.0 43 | weights /= sum_weights 44 | 45 | # Build indices. 
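# NOTE (added comment): helpers.build_blending_indices below is the compiled
# C++ routine from megatron/data/helpers.cpp, built via megatron/data/Makefile.
# As a rough pure-Python sketch of the blending logic (illustration only, not
# the code that runs), it fills the two index arrays approximately like this:
#
#     current = np.zeros(num_datasets)
#     for i in range(self.size):
#         errors = weights * (i + 1) - current  # how far each dataset lags its quota
#         d = int(np.argmax(errors))            # pick the most under-sampled dataset
#         self.dataset_index[i] = d
#         self.dataset_sample_index[i] = int(current[d])
#         current[d] += 1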
46 | start_time = time.time() 47 | assert num_datasets < 255 48 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 49 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 50 | 51 | from megatron.data import helpers 52 | 53 | helpers.build_blending_indices( 54 | self.dataset_index, 55 | self.dataset_sample_index, 56 | weights, 57 | num_datasets, 58 | self.size, 59 | torch.distributed.get_rank() == 0, 60 | ) 61 | 62 | print( 63 | "> RANK {} elapsed time for building blendable dataset indices: " 64 | "{:.2f} (sec)".format( 65 | torch.distributed.get_rank(), time.time() - start_time 66 | ) 67 | ) 68 | 69 | def __len__(self): 70 | return self.size 71 | 72 | def __getitem__(self, idx): 73 | try: 74 | dataset_idx = self.dataset_index[idx] 75 | sample_idx = self.dataset_sample_index[idx] 76 | return self.datasets[dataset_idx][sample_idx] 77 | except IndexError: 78 | new_idx = idx % len(self) 79 | print( 80 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 81 | ) 82 | return self[new_idx] 83 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | plausibility check for the usage of neox_args in the megatron codebase 17 | """ 18 | import pytest 19 | import re 20 | from ..common import get_root_directory 21 | 22 | 23 | @pytest.mark.cpu 24 | def test_neoxargs_usage(): 25 | """ " 26 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs 27 | """ 28 | from megatron.neox_arguments import NeoXArgs 29 | 30 | declared_all = True 31 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys()) 32 | 33 | # we exclude a number of properties (implemented with the @property decorator) or functions that we know exists 34 | exclude = set( 35 | [ 36 | "params_dtype", 37 | "deepspeed_config", 38 | "get", 39 | "pop", 40 | "get_deepspeed_main_args", 41 | 'optimizer["params"]', 42 | "attention_config[layer_number]", 43 | "adlr_autoresume_object", 44 | "update_value", 45 | "all_config", 46 | "tensorboard_writer", 47 | "tokenizer", 48 | "train_batch_size]", 49 | "items", 50 | "configure_distributed_args", 51 | "build_tokenizer", 52 | "attention_config[i]", 53 | "print", 54 | ] 55 | ) 56 | 57 | # test file by file 58 | for filename in (get_root_directory() / "megatron").glob("**/*.py"): 59 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]: 60 | continue 61 | 62 | # load file 63 | with open(filename, "r") as f: 64 | file_contents = f.read() 65 | 66 | # find args matches 67 | matches = list( 68 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) 69 | ) 70 | if len(matches) == 0: 71 | continue 72 | 73 | # compare 74 | for match in matches: 75 | if match not in neox_args_attributes and match not in exclude: 76 | print( 77 | f"(arguments used not found in neox args): {filename.name}: {match}", 78 | flush=True, 79 | ) 80 | declared_all = False 81 | 82 | assert declared_all, "all arguments used in code defined in NeoXArgs" 83 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 |
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 |
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 |
25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);
26 |
27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
28 |                        torch::Tensor const& softmax_results,
29 |                        float scale_factor);
30 |
31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor)
32 | {
33 |     AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
34 |     AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
35 |                    (input.scalar_type() == at::ScalarType::BFloat16),
36 |                "Only fp16 and bf16 are supported");
37 |
38 |     return fwd_cuda(input, scale_factor);
39 | }
40 |
41 | torch::Tensor bwd(torch::Tensor const& output_grads,
42 |                   torch::Tensor const& softmax_results,
43 |                   float scale_factor)
44 | {
45 |     AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
46 |     AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
47 |
48 |     AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 |                    (output_grads.scalar_type() == at::ScalarType::BFloat16),
50 |                "Only fp16 and bf16 are supported");
51 |     AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 |                    (softmax_results.scalar_type() == at::ScalarType::BFloat16),
53 |                "Only fp16 and bf16 are supported");
54 |
55 |     return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 |
58 | } // end namespace scaled_upper_triang_masked_softmax
59 | } // end namespace fused_softmax
60 | } // end namespace multihead_attn
61 |
62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
63 | {
64 |     m.def("forward",
65 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
66 |           "Self Multihead Attention scaled, time masked softmax -- Forward.");
67 |     m.def("backward",
68 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
69 |           "Self Multihead Attention scaled, time masked softmax -- Backward.");
70 | }
71 |
-------------------------------------------------------------------------------- /megatron/mpu/utils.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2021, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | import torch
20 |
21 |
22 | def ensure_divisibility(numerator, denominator):
23 |     """Ensure that numerator is divisible by the denominator."""
24 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
25 |         numerator, denominator
26 |     )
27 |
28 |
29 | def divide(numerator, denominator):
30 |     """Ensure that numerator is divisible by the denominator and return
31 |     the division value."""
32 |     ensure_divisibility(numerator, denominator)
33 |     return numerator // denominator
34 |
35 |
36 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
37 |     """Split a tensor along its last dimension.
38 |     Arguments:
39 |         tensor: input tensor.
40 |         num_partitions: number of partitions to split the tensor
41 |         contiguous_split_chunks: If True, make each chunk contiguous
42 |                                  in memory.
43 |     """
44 |     # Get the size and dimension.
45 |     last_dim = tensor.dim() - 1
46 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
47 |     # Split.
48 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
49 |     # Note: torch.split does not create contiguous tensors by default.
50 |     if contiguous_split_chunks:
51 |         return tuple(chunk.contiguous() for chunk in tensor_list)
52 |
53 |     return tensor_list
54 |
55 |
56 | class VocabUtility:
57 |     """Split the vocabulary into `world_size` chunks and return the
58 |     first and last index of the vocabulary belonging to the `rank`
59 |     partition. Note that the returned indices cover the half-open range [first, last)."""
60 |
61 |     @staticmethod
62 |     def vocab_range_from_per_partition_vocab_size(
63 |         per_partition_vocab_size, rank, world_size
64 |     ):
65 |         index_f = rank * per_partition_vocab_size
66 |         index_l = index_f + per_partition_vocab_size
67 |         return index_f, index_l
68 |
69 |     @staticmethod
70 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
71 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
72 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
73 |             per_partition_vocab_size, rank, world_size
74 |         )
75 |
-------------------------------------------------------------------------------- /megatron/neox_arguments/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | NeoX Arguments manages all configuration arguments.
3 |
4 | **general**
5 |
6 | * The implementation makes use of the python dataclass.
7 | * The main class 'NeoXArgs' (in ./arguments) exposes all configuration attributes that are relevant to GPT NeoX
8 | * No attributes are nested (apart from attributes with type dict)
9 | * Output functions (enable_logging, save_yml, print) are implemented
10 | * Instantiation always runs NeoXArgs.__post_init__(), which calculates derived values and performs a validation (values, types, keys).
11 | * It is possible to set undefined attributes (e.g. the line of code 'NeoXArgs().my_undefined_config = 42' works fine); such set attributes are not validated
12 | * It is possible to update attributes (e.g.
the line of code 'NeoXArgs().do_train = True' works fine); a validation can be performed by calling the validation functions on the class instance
13 | * In order to avoid setting undefined attributes you can use the function NeoXArgs().update_value(); this function raises an error if the attribute to be set is not defined
14 |
15 | **instantiation**
16 | NeoX args can be instantiated with the following options
17 |
18 | * NeoXArgs.from_ymls(["path_to_yaml1", "path_to_yaml2", ...]): load yaml configuration files and instantiate with the values provided; checks for duplications and unknown arguments are performed
19 | * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from a dict; checks for unknown arguments are performed
20 |
21 | * NeoXArgs.consume_deepy_args(): entry point for deepy.py, configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main
22 | * NeoXArgs.consume_neox_args(): in the call stack deepy.py -> deepspeed -> pretrain_gpt2.py, arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(); the arguments so produced can be read with consume_neox_args() to instantiate a NeoXArgs instance.
23 |
24 |
25 | **code structure**
26 |
27 | * NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
28 | * The subclasses group args according to their purpose
29 | * The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be omitted
30 | * The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted
31 | * calculated attributes (decorator '@property') are available as attributes, but are not included in the dataclass fields (e.g.
NeoXArgs().__dataclass_fields__.items()) 32 | * refer to docstrings in code for more information 33 | """ 34 | 35 | 36 | from .arguments import NeoXArgs 37 | -------------------------------------------------------------------------------- /configs/alon_config_small.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | 6 | # model settings, 162,322,944 parameters 7 | "num-layers": 12, 8 | "hidden-size": 768, 9 | "num-attention-heads": 12, 10 | "seq-length": 1024, 11 | "max-position-embeddings": 1024, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 12]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 24 | "train_batch_size": 480, 25 | "train_micro_batch_size_per_gpu": 60, 26 | "gradient_accumulation_steps": 8, 27 | "data-impl": "mmap", 28 | "num_workers": 1, 29 | 30 | # train settings 31 | "train-iters": 100000, 32 | "lr-decay-iters": 100000, 33 | "distributed-backend": "nccl", 34 | "lr-decay-style": "cosine", 35 | "warmup": 0.01, 36 | "checkpoint-factor": 1000, 37 | "eval-interval": 1000, 38 | "eval-iters": 10, 39 | 40 | "log-interval": 10, 41 | "steps_per_print": 10, 42 | "wall_clock_breakdown": true, 43 | 44 | # tokenizer settings 45 | "tokenizer-type": "GPT2BPETokenizer", 46 | "vocab-file": "data/gpt2-vocab.json", 47 | "merge-file": "data/gpt2-merges.txt", 48 | 49 | # wandb settings 50 | "use_wandb": true, 51 | "wandb_project": "neox", 52 | "wandb_group": "alon_config_small", 53 | 54 | # init methods 55 | "init_method": "small_init", 56 | "output_layer_init_method": "wang_init", 57 | 58 | # optimizer settings 59 | "optimizer": { 60 | "type": "Adam", 61 | "params": { 62 | "lr": 0.001, 63 | "betas": [0.9, 0.95], 64 | "eps": 1.0e-8 65 | } 66 | }, 67 | "min_lr": 0.0001, 68 | 69 | "zero_optimization": { 70 | "stage": 1, 71 | "allgather_partitions": true, 72 | "allgather_bucket_size": 500000000, 73 | "overlap_comm": true, 74 | "reduce_scatter": true, 75 | "reduce_bucket_size": 500000000, 76 | "contiguous_gradients": true, 77 | "cpu_offload": false 78 | }, 79 | 80 | # activation checkpointing 81 | "checkpoint-activations": true, 82 | "checkpoint-num-layers": 1, 83 | "partition-activations": true, 84 | "synchronize-each-layer": true, 85 | 86 | # regularization 87 | "gradient_clipping": 1.0, 88 | "weight-decay": 0.1, 89 | "hidden-dropout": 0, 90 | "attention-dropout": 0, 91 | 92 | # precision settings 93 | "fp16": { 94 | "fp16": true, 95 | "enabled": true, 96 | "loss_scale": 0, 97 | "loss_scale_window": 1000, 98 | "initial_scale_power": 12, 99 | "hysteresis": 2, 100 | "min_loss_scale": 1 101 | }, 102 | 103 | "train-data-paths": ["data/openwebtext-processed_text_document"], 104 | "valid-data-paths": ["data/openwebtext-processed_text_document"], 105 | "test-data-paths": ["data/openwebtext-processed_text_document"], 106 | 107 | # to use a single GPU 108 | # "include": "localhost:7", 109 | # "world_size": 1, 110 | } -------------------------------------------------------------------------------- /megatron/model/norms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # 
you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torch.nn import LayerNorm as LayerNorm 17 | 18 | 19 | def get_norm(neox_args): 20 | if neox_args.norm == "rmsnorm": 21 | norm = RMSNorm 22 | eps = neox_args.rms_norm_epsilon 23 | elif neox_args.norm == "layernorm": 24 | eps = neox_args.layernorm_epsilon 25 | norm = LayerNorm 26 | elif neox_args.norm == "scalenorm": 27 | eps = neox_args.scalenorm_epsilon 28 | norm = ScaleNorm 29 | else: 30 | raise ValueError(f"norm {neox_args.norm} not recognized") 31 | return norm, eps 32 | 33 | 34 | class RMSNorm(torch.nn.Module): 35 | def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): 36 | """ 37 | Root Mean Square Layer Normalization 38 | :param dim: model size 39 | :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) 40 | :param eps: epsilon value, default 1e-8 41 | :param bias: whether use bias term for RMSNorm, disabled by 42 | default because RMSNorm doesn't enforce re-centering invariance. 43 | """ 44 | super(RMSNorm, self).__init__() 45 | 46 | self.eps = eps 47 | self.d = dim 48 | self.p = p 49 | self.bias = bias 50 | 51 | self.scale = torch.nn.Parameter(torch.ones(dim)) 52 | self.register_parameter("scale", self.scale) 53 | 54 | if self.bias: 55 | self.offset = torch.nn.Parameter(torch.zeros(dim)) 56 | self.register_parameter("offset", self.offset) 57 | 58 | def forward(self, x): 59 | if self.p < 0.0 or self.p > 1.0: 60 | norm_x = x.norm(2, dim=-1, keepdim=True) 61 | d_x = self.d 62 | else: 63 | partial_size = int(self.d * self.p) 64 | partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) 65 | 66 | norm_x = partial_x.norm(2, dim=-1, keepdim=True) 67 | d_x = partial_size 68 | 69 | rms_x = norm_x * d_x ** (-1.0 / 2) 70 | x_normed = x / (rms_x + self.eps) 71 | 72 | if self.bias: 73 | return self.scale * x_normed + self.offset 74 | 75 | return self.scale * x_normed 76 | 77 | 78 | class ScaleNorm(torch.nn.Module): 79 | def __init__(self, dim, eps=1e-5): 80 | super().__init__() 81 | self.g = torch.nn.Parameter(torch.ones(1)) 82 | self.eps = eps 83 | 84 | def forward(self, x): 85 | n = torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps) 86 | return x / n * self.g 87 | -------------------------------------------------------------------------------- /configs/gen_docs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | from megatron.neox_arguments import neox_args, deepspeed_args 8 | from inspect import getmembers, getsource 9 | from dataclasses import field, is_dataclass 10 | from itertools import tee, zip_longest 11 | import pathlib 12 | 13 | 14 | def pairwise(iterable): 15 | "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
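# NOTE (added comment): tee() creates two independent iterators over the same
# sequence and zip_longest() pads the shorter one with None, so e.g.
# pairwise([1, 2, 3]) yields (1, 2), (2, 3), (3, None); the trailing
# (last, None) pair is how get_docs() below detects the final dataclass field.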
16 | a, b = tee(iterable) 17 | next(b, None) 18 | return zip_longest(a, b) 19 | 20 | 21 | def get_docs(module): 22 | ARGS_CLASSES = getmembers(module, is_dataclass) 23 | results = {} 24 | for name, dcls in ARGS_CLASSES: 25 | assert is_dataclass(dcls) 26 | src = getsource(dcls) 27 | d = dcls() 28 | loc = 0 29 | results[name] = {"doc": d.__doc__.strip(), "attributes": {}} 30 | for cur, _next in pairwise(d.__dataclass_fields__.items()): 31 | field_name, field_def = cur 32 | field_type = field_def.type 33 | if hasattr(field_type, "__name__"): 34 | field_type = field_type.__name__ 35 | else: 36 | field_type = str(field_type) 37 | 38 | field_default = field_def.default 39 | 40 | # try to find the field definition 41 | loc = src.find(f" {field_name}:", loc + len(field_name) + 1) 42 | 43 | if _next is not None: 44 | next_field_name, _ = _next 45 | # try to find the next field definition 46 | next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) 47 | else: 48 | next_loc = len(src) 49 | 50 | # try to get the docstring 51 | _src = src[loc:next_loc].strip() 52 | if '"""' in _src: 53 | doc = _src.split('"""')[1].strip() 54 | elif "'''" in _src: 55 | doc = _src.split("'''")[1].strip() 56 | else: 57 | doc = "" 58 | results[name]["attributes"][field_name] = { 59 | "name": field_name, 60 | "type": field_type, 61 | "default": field_default, 62 | "doc": doc, 63 | } 64 | return results 65 | 66 | 67 | def to_md(docs, intro_str=""): 68 | """ 69 | Writes the docs dictionary to markdown format 70 | """ 71 | lines = [] 72 | lines.append(intro_str) 73 | for name, doc in docs.items(): 74 | lines.append(f"## {name}") 75 | lines.append(f"{doc['doc']}") 76 | lines.append("") 77 | for field_name, field_def in doc["attributes"].items(): 78 | # attribute name and type 79 | lines.append(f"- **{field_name}**: {field_def['type']}") 80 | # default value 81 | lines.append(f" Default = {str(field_def['default'])}") 82 | lines.append(f" {field_def['doc']}") 83 | lines.append("") 84 | return "\n\n".join(lines) 85 | 86 | 87 | if __name__ == "__main__": 88 | docs = get_docs(neox_args) 89 | docs.update(get_docs(deepspeed_args)) 90 | intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n""" 91 | md = to_md(docs, intro_str=intro_str) 92 | with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: 93 | f.write(md) 94 | --------------------------------------------------------------------------------
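To tie the configuration fragments above together: per the module docstring in megatron/neox_arguments/__init__.py, NeoXArgs.from_ymls() merges a list of yaml files into a single validated argument set. A usage sketch follows; it assumes the repo root as working directory, installed dependencies, and that this particular (illustrative) file combination passes validation:

from megatron.neox_arguments import NeoXArgs

neox_args = NeoXArgs.from_ymls([
    "alon_configs/models/160m.yml",       # model / optimizer / train settings
    "alon_configs/gpu/gpu0.yml",          # device placement for a single local GPU
    "alon_configs/data/openwebtext.yml",  # dataset paths
])
print(neox_args.train_batch_size)  # 480, per 160m.yml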