├── tests
│   ├── __init__.py
│   ├── neox_args
│   │   ├── __init__.py
│   │   ├── test_neoxargs_implementation.py
│   │   └── test_neoxargs_usage.py
│   ├── pytest.ini
│   ├── model
│   │   └── __init__.py
│   └── README.md
├── tools
│   ├── kill.sh
│   ├── killall.sh
│   ├── sync_cmd.sh
│   ├── sync.sh
│   ├── syncdir.sh
│   ├── create_run_specific_config.py
│   ├── upload.py
│   └── create_eval_config.py
├── .dockerignore
├── CODEOWNERS
├── megatron
│   ├── data
│   │   ├── __init__.py
│   │   ├── Makefile
│   │   └── blendable_dataset.py
│   ├── gradient_noise_scale
│   │   └── __init__.py
│   ├── tokenizer
│   │   └── __init__.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── fused_bias_dropout.py
│   │   └── norms.py
│   ├── fused_kernels
│   │   ├── compat.h
│   │   ├── __init__.py
│   │   ├── setup.py
│   │   └── scaled_upper_triang_masked_softmax.cpp
│   ├── __init__.py
│   ├── mpu
│   │   ├── random.py
│   │   ├── __init__.py
│   │   └── utils.py
│   └── neox_arguments
│       ├── template.py
│       └── __init__.py
├── requirements
│   ├── requirements-sparseattention.txt
│   ├── requirements-flashattention.txt
│   ├── requirements-onebitadam.txt
│   ├── requirements-tensorboard.txt
│   ├── requirements-dev.txt
│   └── requirements.txt
├── MANIFEST.in
├── alon_configs
│   ├── gpu
│   │   ├── gpu0.yml
│   │   ├── gpu1.yml
│   │   ├── gpu2.yml
│   │   ├── gpu3.yml
│   │   ├── gpu4.yml
│   │   ├── gpu5.yml
│   │   ├── gpu6.yml
│   │   └── gpu7.yml
│   ├── run_specific
│   │   ├── 1B_doremi_1B.yml
│   │   ├── 1B_original.yml
│   │   ├── 160m_original.yml
│   │   ├── 1B_doremi_280.yml
│   │   ├── 410m_original.yml
│   │   ├── 160m_doremi_1B.yml
│   │   ├── 160m_doremi_280.yml
│   │   ├── 410m_doremi_1B.yml
│   │   └── 410m_doremi_280.yml
│   ├── parallelism.yml
│   ├── init.yml
│   ├── eval_tasks.yml
│   ├── data
│   │   └── openwebtext.yml
│   ├── train_data_weights
│   │   ├── doremi_120.yml
│   │   ├── doremi_1B.yml
│   │   ├── doremi_280.yml
│   │   ├── original_pile.yml
│   │   ├── static_1B_final.yml
│   │   └── static_1B_mean.yml
│   └── models
│       ├── eval_1B_1gpu.yml
│       ├── eval_1B_seqlen2048_1gpu.yml
│       ├── 1B_unnamed_train_datasets.yml
│       ├── 3B_unnamed_train_datasets.yml
│       ├── 1B_seqlen2048_unnamed_train_datasets.yml
│       ├── 410m.yml
│       ├── 1B.yml
│       ├── 1B_seqlen2048.yml
│       ├── 1B_150B_tokens.yml
│       ├── eval_3B_1gpu.yml
│       ├── eval_3B_seqlen2048_1gpu.yml
│       ├── 3B.yml
│       ├── eval_410m_1gpu.yml
│       ├── eval_1B_1gpu_local1.yml
│       ├── eval_1B_1gpu_local2.yml
│       ├── eval_1B_1gpu_local3.yml
│       ├── eval_1B_1gpu_local4.yml
│       ├── eval_1B_1gpu_local5.yml
│       ├── 160m.yml
│       └── eval_160m_1gpu.yml
├── configs
│   ├── cpu_mock_config.yml
│   ├── slurm_local.yml
│   ├── text_generation.yml
│   ├── sparse.yml
│   ├── eleutherai_cluster.yml
│   ├── local_setup.yml
│   ├── slurm_125M.yml
│   ├── 125M-json.yml
│   ├── gmlp_small.yml
│   ├── 19M.yml
│   ├── 800M.yml
│   ├── 49M.yml
│   ├── bnb_125M.yml
│   ├── bf16_125M.yml
│   ├── 175B.yml
│   ├── 350M.yml
│   ├── 1-3B.yml
│   ├── 2-7B.yml
│   ├── 6-7B.yml
│   ├── 13B.yml
│   ├── 760M.yml
│   ├── 125M.yml
│   ├── alon_config_small.yml
│   └── gen_docs.py
├── example_train_run.sh
├── scripts
│   ├── convert_to_hf.sh
│   ├── train_bigram.sh
│   ├── evaluate_multishot.sh
│   ├── train_doremi_1B.sh
│   ├── train_doremi_280.sh
│   ├── train_original_pile.sh
│   ├── train_410m_pile.sh
│   ├── train_160m_pile.sh
│   ├── train_160m_exp3.sh
│   ├── train_160m_naive_validation.sh
│   ├── train_1B_original_unnamed_train_datasets.sh
│   ├── train_1B_doremi_120_unnamed_train_datasets.sh
│   ├── train_1B_doremi_280_unnamed_train_datasets.sh
│   ├── train_1B_seqlen2048_static_1B_mean.sh
│   ├── train_1B_seqlen2048_static_1B_final.sh
│   ├── train_1B_seqlen2048_doremi_120_unnamed_train_datasets.sh
│   ├── train_1B_seqlen2048_original_unnamed_train_datasets.sh
│   ├── train_1B_exp3.sh
│   ├── train_1B_exp3_mixed_minibatches.sh
│   ├── train_1B_ema.sh
│   ├── train_1B_ema_0.5smoothing.sh
│   ├── train_3B_ema_0.5smoothing.sh
│   ├── train_1B_ema_0.5smoothing_150B_tokens.sh
│   ├── train_1B_seqlen2048_ema_0.5smoothing.sh
│   └── evaluate.sh
├── eval_tasks
│   └── __init__.py
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   └── workflows
│       ├── cpu_ci.yml
│       ├── pull_request.yml
│       └── docker_build.yml
├── alon_setup.sh
├── preprocess_pile.sh
├── create_debug_data.sh
├── train.py
├── .pre-commit-config.yaml
├── deepy.py
├── CITATION.cff
├── prepare_data.py
└── .gitignore
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 20B_checkpoints/ 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @EleutherAI/pm-gptneo 2 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==0.4.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==0.2.2 2 | -------------------------------------------------------------------------------- /requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111==8.6.0 2 | -------------------------------------------------------------------------------- /requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.5.0 2 | -------------------------------------------------------------------------------- /tools/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /alon_configs/gpu/gpu0.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:0", 3 | "world_size": 1, 4 | "master_port": 12340, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu1.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:1", 3 | "world_size": 1, 4 | "master_port": 12341, 5 | }
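# NOTE: gpu0.yml through gpu7.yml all follow this pattern: each pins a run to a single
# local device ("include": "localhost:N") and a distinct "master_port" (12340 + N),
# presumably so several single-GPU jobs can share one node without port collisions.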
-------------------------------------------------------------------------------- /alon_configs/gpu/gpu2.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:2", 3 | "world_size": 1, 4 | "master_port": 12342, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu3.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:3", 3 | "world_size": 1, 4 | "master_port": 12343, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu4.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:4", 3 | "world_size": 1, 4 | "master_port": 12344, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu5.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:5", 3 | "world_size": 1, 4 | "master_port": 12345, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu6.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:6", 3 | "world_size": 1, 4 | "master_port": 12346, 5 | } -------------------------------------------------------------------------------- /alon_configs/gpu/gpu7.yml: -------------------------------------------------------------------------------- 1 | { 2 | "include": "localhost:7", 3 | "world_size": 1, 4 | "master_port": 12347, 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_doremi_1B", 3 | "wandb_group": "1B_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_original", 3 | "wandb_group": "1B_original" 4 | } -------------------------------------------------------------------------------- /alon_configs/parallelism.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_original", 3 | "wandb_group": "160m_original" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/1B_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/1B_doremi_280", 3 | "wandb_group": "1B_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_original.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_original", 3 | "wandb_group": "410m_original" 4 | } -------------------------------------------------------------------------------- /tests/neox_args/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /alon_configs/init.yml: -------------------------------------------------------------------------------- 1 | { 2 | # init methods 3 | "init_method": "small_init", 4 | "output_layer_init_method": "wang_init", 5 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_doremi_1B", 3 | "wandb_group": "160m_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/160m_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/160m_doremi_280", 3 | "wandb_group": "160m_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_doremi_1B", 3 | "wandb_group": "410m_doremi_1B" 4 | } -------------------------------------------------------------------------------- /alon_configs/run_specific/410m_doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "save": "outputs/410m_doremi_280", 3 | "wandb_group": "410m_doremi_280" 4 | } -------------------------------------------------------------------------------- /alon_configs/eval_tasks.yml: -------------------------------------------------------------------------------- 1 | { 2 | "eval_tasks": ["lambada_openai", "piqa", "winogrande", "arc_easy", "sciq", "wikitext", "openbookqa"], 3 | } -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.5.6 2 | clang-format==13.0.1 3 | pre-commit~=2.17.0 4 | pytest==6.2.3 5 | pytest-cov==2.11.1 6 | pytest-forked==1.3.0 7 | pytest-xdist 8 | transformers~=4.16.2 9 | -------------------------------------------------------------------------------- /configs/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 
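# (Presumably paired with the cpu-marked tests that .github/workflows/cpu_ci.yml runs
# via `pytest tests -m cpu`: hard-coding "global_num_gpus" keeps argument validation
# from probing for real devices.)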
3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /alon_configs/data/openwebtext.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train-data-paths": ["data/openwebtext-processed_text_document"], 3 | "valid-data-paths": ["data/openwebtext-processed_text_document"], 4 | "test-data-paths": ["data/openwebtext-processed_text_document"], 5 | } -------------------------------------------------------------------------------- /example_train_run.sh: -------------------------------------------------------------------------------- 1 | CONFIGS="alon_configs/data/pile_v2.yml alon_configs/models/160.yml alon_configs/init.yml alon_configs/optimizer.yml alon_configs/parallelism.yml" 2 | echo "Running with configs: ${CONFIGS}" 3 | 4 | RUN_NAME="current_run" 5 | python3 deepy.py train.py ${CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 6 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /configs/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data-path": "data/enron/enron_text_document", 3 | "vocab-file": "data/gpt2-vocab.json", 4 | "merge-file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard-dir": "tensorboard", 8 | "log-dir": "logs", 9 | "use_wandb": true, 10 | "wandb_host": "https://api.wandb.ai", 11 | "wandb_project": "neox" 12 | } 13 | -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_120.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1379, 4 | 0.0608, 5 | 0.0757, 6 | 0.1905, 7 | 0.0535, 8 | 0.0325, 9 | 0.038, 10 | 0.0746, 11 | 0.0327, 12 | 0.097, 13 | 0.0292, 14 | 0.0032, 15 | 0.1068, 16 | 0.0019, 17 | 0.0083, 18 | 0.0037, 19 | 0.012, 20 | 0.0084, 21 | 0.0117, 22 | 0.0093, 23 | 0.0084, 24 | 0.004 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1199, 4 | 0.0149, 5 | 0.0739, 6 | 0.3289, 7 | 0.0384, 8 | 0.0129, 9 | 0.0148, 10 | 0.0452, 11 | 0.026, 12 | 0.1461, 13 | 0.025, 14 | 0.0017, 15 | 0.0962, 16 | 0.0004, 17 | 0.0044, 18 | 0.0029, 19 | 0.0078, 20 | 0.0058, 21 | 0.0159, 22 | 0.0063, 23 | 0.0094, 24 | 0.0033 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/doremi_280.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.6057, 4 | 0.0046, 5 | 0.0224, 6 | 0.1019, 7 | 0.0036, 8 | 0.0179, 9 | 0.0043, 10 | 0.0153, 11 | 0.0036, 12 | 0.0113, 13 | 0.0072, 14 | 0.0047, 15 | 0.0699, 16 | 0.0018, 17 | 0.0093, 18 | 0.0061, 19 | 
0.0062, 20 | 0.0134, 21 | 0.0502, 22 | 0.0274, 23 | 0.0063, 24 | 0.007 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/original_pile.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.1811, 4 | 0.144, 5 | 0.1207, 6 | 0.1001, 7 | 0.0896, 8 | 0.0759, 9 | 0.0612, 10 | 0.0513, 11 | 0.0365, 12 | 0.0307, 13 | 0.0217, 14 | 0.0155, 15 | 0.0153, 16 | 0.0124, 17 | 0.0088, 18 | 0.0075, 19 | 0.0073, 20 | 0.0062, 21 | 0.006, 22 | 0.0038, 23 | 0.003, 24 | 0.0014 25 | ] 26 | } -------------------------------------------------------------------------------- /scripts/convert_to_hf.sh: -------------------------------------------------------------------------------- 1 | # Example usage: 2 | # bash scripts/convert_to_hf.sh 3B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing_seed42 30000 alon_configs/models/3B.yml 3 | 4 | METHOD=$1 5 | STEP=$2 6 | MODEL_CONFIG=$3 7 | 8 | python3 tools/convert_sequential_to_hf.py \ 9 | --input_dir outputs/$METHOD/global_step${STEP} \ 10 | --config_file $MODEL_CONFIG \ 11 | --output_dir outputs/$METHOD/global_step${STEP}/hf_model -------------------------------------------------------------------------------- /alon_configs/train_data_weights/static_1B_final.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.0894, 4 | 0.0316, 5 | 0.0701, 6 | 0.0705, 7 | 0.0267, 8 | 0.0155, 9 | 0.0346, 10 | 0.0353, 11 | 0.0403, 12 | 0.0467, 13 | 0.0669, 14 | 0.0553, 15 | 0.0504, 16 | 0.0168, 17 | 0.0363, 18 | 0.07, 19 | 0.0315, 20 | 0.0604, 21 | 0.0373, 22 | 0.0451, 23 | 0.0466, 24 | 0.0228 25 | ] 26 | } -------------------------------------------------------------------------------- /alon_configs/train_data_weights/static_1B_mean.yml: -------------------------------------------------------------------------------- 1 | { 2 | "train_data_weights": [ 3 | 0.0816, 4 | 0.0346, 5 | 0.0706, 6 | 0.0718, 7 | 0.0293, 8 | 0.0178, 9 | 0.0355, 10 | 0.0332, 11 | 0.0398, 12 | 0.0461, 13 | 0.0664, 14 | 0.0507, 15 | 0.0516, 16 | 0.0157, 17 | 0.0351, 18 | 0.0632, 19 | 0.0323, 20 | 0.0607, 21 | 0.0361, 22 | 0.0549, 23 | 0.0471, 24 | 0.0261 25 | ] 26 | } -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed 2 | einops==0.3.0 3 | ftfy==6.0.1 4 | git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 5 | huggingface_hub==0.11.0 6 | lm_eval==0.3.0 7 | numpy==1.22.0 8 | pybind11==2.6.2 9 | regex 10 | sentencepiece 11 | six 12 | tiktoken==0.1.2 13 | tokenizers==0.12.1 14 | transformers~=4.24.0 15 | wandb==0.10.28 16 | protobuf==3.20.* 17 | best-download 18 | urllib3~=1.26.16 19 | datasets==2.16.0 -------------------------------------------------------------------------------- /scripts/train_bigram.sh: -------------------------------------------------------------------------------- 1 | NUM_TRAIN_SAMPLES=10000000 2 | 3 | OUTPUT_DIR="outputs/bigram_model" 4 | mkdir -p $OUTPUT_DIR 5 | 6 | # get dataset names from path 7 | for f in /share/edc/home/alon_albalak/data/pile/test/*; do 8 | DATASET_NAME=$(basename $f) 9 | DATASET_NAME=${DATASET_NAME%.jsonl} 10 | echo $DATASET_NAME 11 | 12 | python3 
bigram_model.py \ 13 | --train \ 14 | --evaluate \ 15 | --dataset_name $DATASET_NAME \ 16 | --train_samples $NUM_TRAIN_SAMPLES \ 17 | > ${OUTPUT_DIR}/${DATASET_NAME}.log 2> ${OUTPUT_DIR}/${DATASET_NAME}.err 18 | 19 | done -------------------------------------------------------------------------------- /configs/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text-gen-type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num-samples": 10, 17 | 18 | # input/output file 19 | "sample-input-file": "sample_input.txt", 20 | "sample-output-file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /configs/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /alon_setup.sh: -------------------------------------------------------------------------------- 1 | conda activate base 2 | conda create --name gptneox python=3.8 3 | conda activate gptneox 4 | conda install pytorch pytorch-cuda=12.1 -c pytorch-nightly -c nvidia 5 | pip install -r requirements/requirements.txt 6 | pip install -r requirements/requirements-flashattention.txt 7 | python ./megatron/fused_kernels/setup.py install 8 | 9 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/utils.py 10 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/zero/stage2.py 11 | sed -i 's/from torch._six import inf/from torch import inf/g' ${CONDA_PREFIX}/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py 12 | mkdir outputs -------------------------------------------------------------------------------- /scripts/evaluate_multishot.sh: -------------------------------------------------------------------------------- 1 | RUN_NAME=$1 2 | MODEL_CONFIG_EVAL=$2 3 | STEP=$3 4 | 5 | # MODEL_CONFIG_EVAL is the file name in the alon_configs/models/ directory 6 | # e.g. 
eval_1B_1gpu (which the script expands to alon_configs/models/eval_1B_1gpu.yml) 7 | # OR eval_3B_seqlen2048_1gpu 8 | 9 | # evaluate 0-shot 10 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step${STEP}/configs/${RUN_NAME}.yml alon_configs/models/${MODEL_CONFIG_EVAL}.yml ${STEP} 0 2>&1 | tee outputs/${RUN_NAME}_${STEP}_eval.log & 11 | # evaluate 1-shot through 5-shot 12 | for i in {1..5}; do 13 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step${STEP}/configs/${RUN_NAME}.yml alon_configs/models/${MODEL_CONFIG_EVAL}.yml ${STEP} ${i} ${i} 2>&1 | tee outputs/${RUN_NAME}_${STEP}_${i}shot_eval.log & 14 | done 15 | wait < <(jobs -p) -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .test_model_instantiation import run_test_model_instantiation 16 | from .test_model_train import run_train_test 17 | from .test_model_checkpoint import run_checkpoint_test 18 | -------------------------------------------------------------------------------- /tools/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Proposed solution** 24 | If you have an idea for how we can fix this problem, describe it here. 25 | 26 | **Screenshots** 27 | If applicable, add screenshots to help explain your problem.
28 | 29 | **Environment (please complete the following information):** 30 | - GPUs: 31 | - Configs: 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /tools/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /.github/workflows/cpu_ci.yml: -------------------------------------------------------------------------------- 1 | name: "Run CPU Tests" 2 | 3 | on: "push" 4 | 5 | jobs: 6 | run-tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | 11 | - name: Install Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.8" 15 | 16 | - name: Upgrade Pip 17 | run: python -m pip install --upgrade pip 18 | 19 | - name: Install Dependencies 20 | run: | 21 | sudo apt-get install libopenmpi-dev -y 22 | pip install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu 23 | pip install -r requirements/requirements.txt 24 | pip install -r requirements/requirements-dev.txt 25 | 26 | - name: Prepare Data 27 | run: python prepare_data.py 28 | 29 | - name: Run CPU Tests 30 | run: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu 31 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import get_params_for_weight_decay_optimization 20 | from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /tools/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files or directories to all nodes 18 | # Usage 19 | # syncdir.sh path [path2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes.
*/ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /preprocess_pile.sh: -------------------------------------------------------------------------------- 1 | domains=$1 2 | 3 | for split in train validation test; 4 | do 5 | SPLIT_PATH="/share/edc/home/alon_albalak/data/pile/${split}/" 6 | for DATASET_NAME in ${domains}; 7 | do 8 | DATASET_PATH="${SPLIT_PATH}${DATASET_NAME}.jsonl" 9 | echo "path: dataset path: ${DATASET_PATH}" 10 | echo "name: dataset name: ${DATASET_NAME}" 11 | echo "outputting to: /share/edc/home/alon_albalak/data/pile/preprocessed/${DATASET_NAME}" 12 | 13 | OUTPUT_DIR=/share/edc/home/alon_albalak/data/pile/preprocessed/$split/${DATASET_NAME} 14 | mkdir -p ${OUTPUT_DIR} 15 | 16 | python tools/preprocess_data.py \ 17 | --input $DATASET_PATH \ 18 | --output-prefix ${OUTPUT_DIR}/${DATASET_NAME} \ 19 | --vocab-file /share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json \ 20 | --dataset-impl mmap \ 21 | --tokenizer-type HFTokenizer \ 22 | --workers 24 \ 23 | --append-eod 2>&1 | tee ${OUTPUT_DIR}.log 24 | done 25 | done -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /tools/create_run_specific_config.py: -------------------------------------------------------------------------------- 1 | # Simple utility to create a yml config file from command line arguments. 2 | # Handles int, float, bool, and string arguments. 
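# Example (hypothetical values; the flag names mirror scripts/train_160m_exp3.sh):
#   python3 tools/create_run_specific_config.py --save outputs/my_run --seed 42 --mixed_batches true
# writes alon_configs/run_specific/my_run.yml containing:
#   {"save": "outputs/my_run", "seed": 42, "mixed_batches": true}
# Values are coerced to int/float/bool where possible; everything else stays a string.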
3 | 4 | import sys 5 | import os 6 | import json 7 | 8 | RUN_SPECIFIC_CONFIG_PATH="alon_configs/run_specific" 9 | 10 | print(sys.argv[1:]) 11 | config = {} 12 | for k, v in zip(sys.argv[1::2], sys.argv[2::2]): 13 | k = k.replace("--", "") 14 | # first, handle numeric inputs 15 | try: 16 | f_v = float(v) 17 | if f_v.is_integer(): 18 | v = int(v) 19 | else: 20 | v = float(v) 21 | # if not numeric, then convert bools 22 | except ValueError: 23 | if v.lower() == "true": 24 | v = True 25 | elif v.lower() == "false": 26 | v = False 27 | # otherwise, it's a string and we do nothing 28 | config[k] = v 29 | 30 | print(config) 31 | 32 | assert("save" in config) 33 | save_path = os.path.join(RUN_SPECIFIC_CONFIG_PATH, config["save"].split("/")[-1] + ".yml") 34 | with open(save_path, "w") as f: 35 | json.dump(config, f, indent=2) 36 | 37 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .initialize import initialize_megatron 27 | from .neox_arguments import NeoXArgs 28 | -------------------------------------------------------------------------------- /scripts/train_doremi_1B.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/doremi_1B.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_doremi_1B.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_doremi_1B.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_doremi_1B.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_doremi_1B" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_doremi_1B" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_doremi_1B" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_doremi_280.sh:
-------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/doremi_280.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_doremi_280.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_doremi_280.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_doremi_280.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_doremi_280" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_doremi_280" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_doremi_280" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_original_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | CONFIGS_160M="alon_configs/models/160m.yml alon_configs/run_specific/160m_original.yml" 7 | CONFIGS_410M="alon_configs/models/410m.yml alon_configs/run_specific/410m_original.yml" 8 | CONFIGS_1B="alon_configs/models/1B.yml alon_configs/run_specific/1B_original.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${CONFIGS_160M}" 11 | RUN_NAME="160m_original" 12 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_160M} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${CONFIGS_410M}" 15 | RUN_NAME="410m_original" 16 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_410M} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${CONFIGS_1B}" 19 | RUN_NAME="1B_original" 20 | python3 deepy.py train.py ${CONFIGS} ${CONFIGS_1B} 2>&1 | tee outputs/${RUN_NAME}.log 21 | -------------------------------------------------------------------------------- /create_debug_data.sh: -------------------------------------------------------------------------------- 1 | SHARE_DIR=/share/edc/home/alon_albalak/data/pile/debug 2 | mkdir -p ${SHARE_DIR} 3 | 4 | for split in train validation test; do 5 | mkdir -p ${SHARE_DIR}/$split 6 | mkdir -p ${SHARE_DIR}/preprocessed/$split 7 | for DATASET_FILE in $(ls /share/edc/home/alon_albalak/data/pile/$split); do 8 | DATASET_NAME=${DATASET_FILE::-6} 9 | echo ${DATASET_FILE} 10 | echo ${DATASET_NAME} 11 | head -n 200 /share/edc/home/alon_albalak/data/pile/$split/${DATASET_FILE} > ${SHARE_DIR}/$split/${DATASET_FILE} 12 | 13 | OUTPUT_DIR=${SHARE_DIR}/preprocessed/$split/${DATASET_NAME} 14 | mkdir -p ${OUTPUT_DIR} 15 | 16 | python tools/preprocess_data.py \ 17 | --input ${SHARE_DIR}/$split/${DATASET_FILE} \ 18 | --output-prefix ${OUTPUT_DIR}/${DATASET_NAME} \ 19 | --vocab-file /share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json \ 20 | --dataset-impl mmap \ 21 | 
--tokenizer-type HFTokenizer \ 22 | --append-eod 23 | # 2>&1 | tee /share/edc/home/alon_albalak/data/pile/preprocessed/${DATASET_NAME}.log 24 | done 25 | done -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | 15 | update-documentation: 16 | runs-on: ubuntu-20.04 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | ref: ${{ github.event.pull_request.head.ref}} 21 | - run: | 22 | rm megatron/__init__.py 23 | pip install shortuuid 24 | rm megatron/neox_arguments/__init__.py 25 | python configs/gen_docs.py 26 | git config user.name github-actions 27 | git config user.email github-actions@github.com 28 | git add configs/neox_arguments.md 29 | git commit -m "Update NeoXArgs docs automatically" 30 | git push 31 | run-tests: 32 | runs-on: self-hosted 33 | steps: 34 | - uses: actions/checkout@v2 35 | - name: prepare data 36 | run: python prepare_data.py 37 | - name: Run Tests 38 | run: pytest --forked tests 39 | -------------------------------------------------------------------------------- /scripts/train_410m_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/models/410m.yml alon_configs/init.yml alon_configs/optimizer.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | # RUN SPECIFIC CONFIGS 6 | ORIGINAL_WEIGHT_CONFIGS="alon_configs/train_data_weights/original_pile.yml alon_configs/run_specific/410m_original.yml" 7 | DOREMI_280_CONFIGS="alon_configs/train_data_weights/doremi_280.yml alon_configs/run_specific/410m_doremi_280.yml" 8 | DOREMI_1B_CONFIGS="alon_configs/train_data_weights/doremi_1B.yml alon_configs/run_specific/410m_doremi_1B.yml" 9 | 10 | echo "Running with configs: ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS}" 11 | RUN_NAME="410m_original" 12 | python3 deepy.py train.py ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 13 | 14 | echo "Running with configs: ${CONFIGS} ${DOREMI_280_CONFIGS}" 15 | RUN_NAME="410m_doremi_280" 16 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_280_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | echo "Running with configs: ${CONFIGS} ${DOREMI_1B_CONFIGS}" 19 | RUN_NAME="410m_doremi_1B" 20 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_1B_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /scripts/train_160m_pile.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/models/160m.yml alon_configs/init.yml alon_configs/optimizer.yml" 3 | # Doesn't include alon_configs/eval_tasks.yml or alon_configs/parallelism.yml 4 | 5 | 6 | # RUN SPECIFIC CONFIGS 7 | ORIGINAL_WEIGHT_CONFIGS="alon_configs/train_data_weights/original_pile.yml alon_configs/run_specific/160m_original.yml" 8 | DOREMI_280_CONFIGS="alon_configs/train_data_weights/doremi_280.yml alon_configs/run_specific/160m_doremi_280.yml" 9 | 
DOREMI_1B_CONFIGS="alon_configs/train_data_weights/doremi_1B.yml alon_configs/run_specific/160m_doremi_1B.yml" 10 | 11 | echo "Running with configs: ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS}" 12 | RUN_NAME="160m_original" 13 | python3 deepy.py train.py ${CONFIGS} ${ORIGINAL_WEIGHT_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 14 | 15 | echo "Running with configs: ${CONFIGS} ${DOREMI_280_CONFIGS}" 16 | RUN_NAME="160m_doremi_280" 17 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_280_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | echo "Running with configs: ${CONFIGS} ${DOREMI_1B_CONFIGS}" 20 | RUN_NAME="160m_doremi_1B" 21 | python3 deepy.py train.py ${CONFIGS} ${DOREMI_1B_CONFIGS} 2>&1 | tee outputs/${RUN_NAME}.log -------------------------------------------------------------------------------- /configs/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train-data-paths": ["/mnt/ssd-1/data/enron/enron_train_text_document"], 5 | "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_val_text_document"], 6 | "test-data-paths": ["/mnt/ssd-1/data/enron/enron_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split": "995,4,1", 18 | # "data_path": "/mnt/ssd-1/data/enron/enron_train_text_document", 19 | 20 | "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard-dir": "/mnt/ssd-1/tensorboard", 25 | "log-dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /scripts/train_160m_exp3.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/160m.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="160m_ods" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | 10 | SEEDS=(1234 42 100 222) 11 | 12 | # RUN SPECIFIC CONFIGS 13 | for SEED in ${SEEDS[@]}; do 14 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 15 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY}" 16 | python3 tools/create_run_specific_config.py ${ARGS} 17 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 18 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 19 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 20 | 21 | # evaluate 22 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml
alon_configs/models/eval_160m_1gpu.yml 23 | done -------------------------------------------------------------------------------- /configs/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data-path": "data/enron/enron_text_document", 4 | 5 | # or for weighted datasets: 6 | # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 7 | # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 8 | # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"], 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 14 | # WARNING: setting this to True will override any user provided weights 15 | # "weight_by_num_documents": false, 16 | # "weighted_sampler_alpha": 0.3, 17 | 18 | "vocab-file": "data/gpt2-vocab.json", 19 | "merge-file": "data/gpt2-merges.txt", 20 | 21 | "save": "checkpoints", 22 | "load": "checkpoints", 23 | "checkpoint_validation_with_forward_pass": False, 24 | 25 | "tensorboard-dir": "tensorboard", 26 | "log-dir": "logs", 27 | "use_wandb": True, 28 | "wandb_host": "https://api.wandb.ai", 29 | "wandb_project": "neox" 30 | } 31 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
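# Per-rank training entry point. It is normally not invoked directly: the shell scripts
# under scripts/ launch it through the DeepSpeed wrapper, e.g.
#   python3 deepy.py train.py <config1.yml> <config2.yml> ...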
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | if __name__ == "__main__": 23 | neox_args = NeoXArgs.consume_neox_args() 24 | neox_args.configure_distributed_args() 25 | neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab 26 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 27 | pretrain(neox_args=neox_args) 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-json 7 | - id: check-symlinks 8 | - id: check-yaml 9 | - id: destroyed-symlinks 10 | - id: end-of-file-fixer 11 | exclude: docs/CNAME 12 | - id: fix-byte-order-marker 13 | - id: fix-encoding-pragma 14 | args: [--remove] 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: requirements-txt-fixer 18 | - id: trailing-whitespace 19 | - repo: https://gitlab.com/daverona/pre-commit-cpp 20 | rev: 0.8.0 21 | hooks: 22 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 23 | args: [] 24 | 25 | - repo: https://github.com/psf/black 26 | rev: 22.3.0 27 | hooks: 28 | - id: black 29 | language_version: python3 30 | - repo: https://github.com/codespell-project/codespell 31 | rev: v2.1.0 32 | hooks: 33 | - id: codespell 34 | args: [ 35 | '--ignore-words-list=reord,dout', # Word used in error messages that need rewording 36 | --check-filenames, 37 | --check-hidden, 38 | ] 39 | -------------------------------------------------------------------------------- /deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed 20 | from deepspeed.launcher.runner import main 21 | 22 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | from megatron.neox_arguments import NeoXArgs 25 | from megatron.utils import get_wandb_api_key 26 | 27 | 28 | neox_args = NeoXArgs.consume_deepy_args() 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | if __name__ == "__main__": 38 | main(deepspeed_main_args) 39 | -------------------------------------------------------------------------------- /.github/workflows/docker_build.yml: -------------------------------------------------------------------------------- 1 | name: docker_build 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | main: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - 13 | name: Checkout 14 | uses: actions/checkout@v2 15 | 16 | - 17 | name: Docker meta 18 | id: docker_meta 19 | uses: crazy-max/ghaction-docker-meta@v1 20 | with: 21 | images: leogao2/gpt-neox # list of Docker images to use as base name for tags 22 | tag-sha: true # add git short SHA as Docker tag 23 | 24 | - 25 | name: Set up QEMU 26 | uses: docker/setup-qemu-action@v1 27 | 28 | - 29 | name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v1 31 | 32 | - 33 | name: Login to DockerHub 34 | uses: docker/login-action@v1 35 | with: 36 | username: ${{ secrets.DOCKERHUB_USERNAME }} 37 | password: ${{ secrets.DOCKERHUB_TOKEN }} 38 | 39 | - 40 | name: Build and push 41 | id: docker_build 42 | uses: docker/build-push-action@v2 43 | with: 44 | push: ${{ github.event_name != 'pull_request' }} 45 | tags: ${{ steps.docker_meta.outputs.tags }} 46 | labels: ${{ steps.docker_meta.outputs.labels }} 47 | 48 | - 49 | name: Image digest 50 | run: echo ${{ steps.docker_build.outputs.digest }} 51 | -------------------------------------------------------------------------------- /scripts/train_160m_naive_validation.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/160m.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="160m_ods_naive_validation_10" 6 | DATA_SAMPLING_METHOD="naive_validation" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="10" 9 | MIXED_BATCHES=true 10 | VALIDATION_BASED_REWARD=true 11 | 12 | SEEDS=(1234 42 100 222) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_batches ${MIXED_BATCHES} --validation_based_reward ${VALIDATION_BASED_REWARD}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee 
outputs/${RUN_NAME}.log 22 | 23 | # evaluate 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_160m_1gpu.yml 25 | done -------------------------------------------------------------------------------- /scripts/train_1B_original_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_original_unnamed_train_datasets" 6 | 7 | SEEDS=(1234 42 100 222) 8 | 9 | # RUN SPECIFIC CONFIGS 10 | for SEED in ${SEEDS[@]}; do 11 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 12 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 13 | python3 tools/create_run_specific_config.py ${ARGS} 14 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 15 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 16 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 17 | 18 | # evaluate 0-shot 19 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 20 | # evaluate 1-shot through 5-shot 21 | for i in {1..5}; do 22 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 23 | done 24 | wait < <(jobs -p) 25 | done -------------------------------------------------------------------------------- /scripts/train_1B_doremi_120_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_120.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_doremi_120_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # evaluate 0-shot 20 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # evaluate 1-shot through 5-shot 22 | for i in {1..5}; do 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | done 25 | wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- 
/scripts/train_1B_doremi_280_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_280.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_doremi_280_unnamed_train_datasets" 6 | 7 | SEEDS=(1234 42 100 222) 8 | # SEEDS=( 1234 222 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # evaluate 0-shot 20 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # evaluate 1-shot through 5-shot 22 | for i in {1..5}; do 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | done 25 | wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_static_1B_mean.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/static_1B_mean.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_static_1B_mean" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_static_1B_final.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml 
alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/static_1B_final.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_static_1B_final" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_doremi_120_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/doremi_120.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_doremi_120_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_original_unnamed_train_datasets.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | 
WANDB_GROUP="1B_seqlen2048_original_unnamed_train_datasets" 6 | 7 | # SEEDS=(1234 42 100 222) 8 | SEEDS=( 42 ) 9 | 10 | # RUN SPECIFIC CONFIGS 11 | for SEED in ${SEEDS[@]}; do 12 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 13 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED}" 14 | python3 tools/create_run_specific_config.py ${ARGS} 15 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 16 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 17 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 18 | 19 | # # evaluate 0-shot 20 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 21 | # # evaluate 1-shot through 5-shot 22 | # for i in {1..5}; do 23 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 24 | # done 25 | # wait < <(jobs -p) 26 | done -------------------------------------------------------------------------------- /scripts/train_1B_exp3.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | 10 | # SEEDS=(1234 42 100 222) 11 | SEEDS=( 1234 ) 12 | 13 | # RUN SPECIFIC CONFIGS 14 | for SEED in ${SEEDS[@]}; do 15 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 16 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY}" 17 | python3 tools/create_run_specific_config.py ${ARGS} 18 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 19 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 20 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 21 | 22 | # evaluate 0-shot 23 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 24 | # evaluate 5-shot 25 | NUM_SHOTS=5 26 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 27 | done -------------------------------------------------------------------------------- /megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # We have mostly moved to DeepSpeed's checkpointing (the code is identical anyway), so this file currently contains only imports 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether to apply model parallelism to checkpointed hidden states. 27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object. 30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | 3 | Tests use pytest with the coverage and forked plugins. Install with: 4 | 5 | ```bash 6 | pip install -r requirements/requirements-dev.txt 7 | ``` 8 | 9 | Download the required test data: 10 | ```bash 11 | python prepare_data.py 12 | ``` 13 | 14 | # Run 15 | 16 | Tests can be run using pytest. 17 | 18 | * The argument --forked needs to be provided 19 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation) 20 | * A subset of tests can be selected by pointing to the module within tests 21 | 22 | ```bash 23 | # run all tests, output coverage report of megatron module in terminal 24 | pytest --forked --cov-report term --cov=megatron tests 25 | 26 | # run tests in tests/model, output coverage report of megatron module as html 27 | pytest --forked --cov-report html --cov=megatron tests/model 28 | 29 | # run tests in tests/model/test_model_generation.py, don't output coverage report 30 | pytest --forked tests/model/test_model_generation.py 31 | ``` 32 | 33 | Some tests can run on CPU only. These are marked with the decorator @pytest.mark.cpu. 34 | The test cases for CPU can be run with: 35 | ```bash 36 | pytest tests -m cpu 37 | ``` 38 | 39 | If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files. 40 | 41 | ```bash 42 | python -m http.server --directory htmlcov 8000 43 | ``` 44 | 45 | 46 | ## Tips and Tricks 47 | If you see this kind of error: 48 | ``` 49 | RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method 50 | ``` 51 | It means that some torch.cuda function was called before the test framework forked its worker processes.
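A minimal sketch of the fix (hypothetical test file; the point is to touch CUDA only inside the test body, after pytest-forked has created the child process):

```python
import pytest
import torch

# BAD: a module-level CUDA call like the one below runs in the parent process
# during collection, so every forked child would later fail with the
# "Cannot re-initialize CUDA in forked subprocess" error.
# NUM_GPUS = torch.cuda.device_count()


@pytest.mark.forked
def test_cuda_tensor():
    # GOOD: CUDA is first initialized here, inside the forked child process.
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    x = torch.ones(2, 2, device="cuda")
    assert x.sum().item() == 4.0
```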
52 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | import subprocess 18 | 19 | from torch.utils import cpp_extension 20 | from pathlib import Path 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | 24 | # Setting this param to a list has a problem of generating different 25 | # compilation commands (with different order of architectures) and 26 | # leading to recompilation of fused kernels. Set it to empty string 27 | # to avoid recompilation and assign arch flags explicitly in 28 | # extra_cuda_cflags below 29 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 30 | 31 | 32 | def load_fused_kernels(): 33 | try: 34 | import scaled_upper_triang_masked_softmax_cuda 35 | import scaled_masked_softmax_cuda 36 | except (ImportError, ModuleNotFoundError): 37 | print("\n") 38 | print("=" * 100) 39 | print( 40 | f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them' 41 | ) 42 | print("=" * 100) 43 | exit() 44 | return 45 | -------------------------------------------------------------------------------- /scripts/train_1B_exp3_mixed_minibatches.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_mixed_minibatches_original_weights_init" 6 | DATA_SAMPLING_METHOD="exp3" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | 11 | # SEEDS=(1234 42 100 222) 12 | SEEDS=(42) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 22 | 23 | # evaluate 0-shot 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 25 | # evaluate 5-shot 26 | 
NUM_SHOTS=5 27 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 28 | done -------------------------------------------------------------------------------- /scripts/train_1B_ema.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | 11 | # SEEDS=(1234 42 100 222) 12 | SEEDS=(42) 13 | 14 | # RUN SPECIFIC CONFIGS 15 | for SEED in ${SEEDS[@]}; do 16 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 17 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES}" 18 | python3 tools/create_run_specific_config.py ${ARGS} 19 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 20 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 21 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 22 | 23 | # evaluate 0-shot 24 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log 25 | # evaluate 5-shot 26 | NUM_SHOTS=5 27 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml ${NUM_SHOTS} 2>&1 | tee outputs/${RUN_NAME}_${NUM_SHOTS}shot_eval.log 28 | done -------------------------------------------------------------------------------- /tools/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | authors: 4 | - affiliation: EleutherAI 5 | family-names: Andonian 6 | given-names: Alex 7 | - affiliation: EleutherAI 8 | family-names: Biderman 9 | given-names: Stella 10 | - affiliation: EleutherAI 11 | family-names: Black 12 | given-names: Sid 13 | - affiliation: EleutherAI 14 | family-names: Gali 15 | given-names: Preetham 16 | - affiliation: EleutherAI 17 | family-names: Gao 18 | given-names: Leo 19 | - affiliation: EleutherAI 20 | family-names: Hallahan 21 | given-names: Eric 22 | - affiliation: EleutherAI 23 | family-names: Levy-Kramer 24 | given-names: Josh 25 | - affiliation: EleutherAI 26 | family-names: Leahy 27 | given-names: Connor 28 | - affiliation: EleutherAI 29 | family-names: Nestler 30 | given-names: Lucas 31 | - affiliation: EleutherAI 32 | family-names: Parker 33 | given-names: Kip 34 | - affiliation: EleutherAI 35 | family-names: Pieler 36 | given-names: Michael 37 | - affiliation: EleutherAI 38 | family-names: Purohit 39 | given-names: Shivanshu 40 | - affiliation: EleutherAI 41 | family-names: Songz 42 | given-names: Tri 43 | - affiliation: EleutherAI 44 | family-names: Wang 45 | given-names: Phil 46 | - affiliation: EleutherAI 47 | family-names: Weinbach 48 | given-names: Samuel 49 | cff-version: "1.1.0" 50 | keywords: 51 | - "Transformers" 52 | - "Massive language model" 53 | - "Autoregressive language model" 54 | license: "Apache-2.0" 55 | message: "If you use this software, please cite it using these metadata." 56 | repository-code: "https://www.github.com/eleutherai/gpt-neox" 57 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch" 58 | version: "0.0.1" 59 | doi: "10.5281/zenodo.5879544" 60 | date-released: 2021-08-23 61 | ... 62 | -------------------------------------------------------------------------------- /tools/create_eval_config.py: -------------------------------------------------------------------------------- 1 | # Simple utility that takes an existing yml config file and adds evaluation-specific parameters.
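# Usage sketch (invocation as in scripts/evaluate.sh):
#   python3 tools/create_eval_config.py --config_path ${MODEL_CONFIG} \
#       --num_fewshot ${NUM_FEWSHOT} --iteration ${STEP}
# With --num_fewshot 0 this writes <config>_eval.yml next to the input config;
# with N > 0 it writes <config>_eval_Nshot.yml (see the save_path handling below).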
2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | def get_step_from_path(path): 8 | top_dir = path.split("/")[-1] 9 | if "global_step" in top_dir: 10 | return int(top_dir.split("global_step")[-1]) 11 | else: 12 | return get_step_from_path("/".join(path.split("/")[:-1])) 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--config_path", type=str, required=True, help="Path to the config file to modify.") 16 | parser.add_argument("--num_fewshot", type=int, required=False, default=0, help="Flag for the number of few-shot in-context examples to use; 0 if none.") 17 | parser.add_argument("--iteration", type=int, required=False, default=None, help="Iteration of the model to evaluate. If not specified, will use the latest checkpoint.") 18 | 19 | args = parser.parse_args() 20 | model_step = get_step_from_path(args.config_path) 21 | 22 | with open(args.config_path, "r") as f: 23 | config = json.load(f) 24 | 25 | if "seed" not in config.keys(): 26 | config["seed"] = 1234 27 | 28 | config["wandb_run_name"] = f"seed{config['seed']}_eval" 29 | 30 | eval_results_prefix = os.path.join(config["save"], f"step{model_step}") 31 | config['load'] = config['save'] 32 | config['eval_results_prefix'] = eval_results_prefix 33 | 34 | save_path = args.config_path.replace(".yml", "_eval.yml") 35 | 36 | # if using iteration 37 | if args.iteration is not None: 38 | config["iteration"] = args.iteration 39 | 40 | # if using num_fewshot 41 | if args.num_fewshot > 0: 42 | config["eval_num_fewshot"] = args.num_fewshot 43 | config["wandb_run_name"] += f"_{args.num_fewshot}shot" 44 | save_path = save_path.replace(".yml", f"_{args.num_fewshot}shot.yml") 45 | 46 | with open(save_path, "w") as f: 47 | json.dump(config, f, indent=2) -------------------------------------------------------------------------------- /megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | Generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | Updates a property value if the key already exists. 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error.
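Example (behavior as implemented below): update_value("seed", 42) sets
self.seed when a "seed" field is already defined on the dataclass, and
raises ValueError (after logging the error) for an unknown key.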
33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value() to be updated property " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exists 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /configs/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true 30 | }, 31 | "train_micro_batch_size_per_gpu": 4, 32 | "data-impl": "mmap", 33 | "split": "949,50,1", 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | "gradient_clipping": 1.0, 39 | "weight-decay": 0.0, 40 | "hidden-dropout": 0.0, 41 | "attention-dropout": 0.0, 42 | "fp16": { 43 | "enabled": true, 44 | "loss_scale": 0, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "checkpoint-factor": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | "log-interval": 100, 58 | "steps_per_print": 10, 59 | "keep-last-n-checkpoints": 4, 60 | "wall_clock_breakdown": true, 61 | "launcher": "slurm", 62 | "deepspeed_slurm": true, 63 | "comment": "neox" 64 | } 65 | -------------------------------------------------------------------------------- /scripts/train_1B_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency 
${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # evaluate 0-shot 25 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # evaluate 1-shot through 5-shot 27 | for i in {1..5}; do 28 | bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | done 30 | wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_3B_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/3B.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="3B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_1B_ema_0.5smoothing_150B_tokens.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_150B_tokens.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | 
WANDB_GROUP="1B_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing_150B_tokens" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /scripts/train_1B_seqlen2048_ema_0.5smoothing.sh: -------------------------------------------------------------------------------- 1 | # GENERAL CONFIGS THAT WILL BE USED FOR ALL RUNS 2 | CONFIGS="alon_configs/data/pile.yml alon_configs/init.yml alon_configs/models/1B_seqlen2048.yml alon_configs/eval_tasks.yml alon_configs/train_data_weights/original_pile.yml" 3 | # Doesn't include alon_configs/parallelism.yml 4 | 5 | WANDB_GROUP="1B_seqlen2048_ods_smoothed_mean_mixed_minibatches_original_weights_init_05smoothing" 6 | DATA_SAMPLING_METHOD="smoothed_mean" 7 | DATA_SAMPLING_WARMUP_STEPS="2000" 8 | DATA_SAMPLING_UPDATE_FREQUENCY="1" 9 | MIXED_MINIBATCHES=true 10 | SMOOTHING_FACTOR="0.5" 11 | 12 | # SEEDS=(1234 42 100 222) 13 | SEEDS=(42) 14 | 15 | # RUN SPECIFIC CONFIGS 16 | for SEED in ${SEEDS[@]}; do 17 | RUN_NAME="${WANDB_GROUP}_seed${SEED}" 18 | ARGS="--seed ${SEED} --save outputs/${RUN_NAME} --wandb_group ${WANDB_GROUP} --wandb_run_name seed${SEED} --data_sampling_method ${DATA_SAMPLING_METHOD} --data_sampling_warmup_steps ${DATA_SAMPLING_WARMUP_STEPS} --data_sampling_update_frequency ${DATA_SAMPLING_UPDATE_FREQUENCY} --mixed_minibatches ${MIXED_MINIBATCHES} --data_sampling_smoothing_factor ${SMOOTHING_FACTOR}" 19 | python3 tools/create_run_specific_config.py ${ARGS} 20 | RUN_SPECIFIC_CONFIG="alon_configs/run_specific/${RUN_NAME}.yml" 21 | echo "Running with configs: ${CONFIGS} ${RUN_SPECIFIC_CONFIG}" 22 | python3 deepy.py train.py ${CONFIGS} ${RUN_SPECIFIC_CONFIG} 2>&1 | tee outputs/${RUN_NAME}.log 23 | 24 | # # evaluate 0-shot 25 | # bash scripts/evaluate.sh outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu.yml 2>&1 | tee outputs/${RUN_NAME}_eval.log & 26 | # # evaluate 1-shot through 5-shot 27 | # for i in {1..5}; do 28 | # bash scripts/evaluate.sh 
outputs/${RUN_NAME}/global_step100000/configs/${RUN_NAME}.yml alon_configs/models/eval_1B_1gpu_local${i}.yml ${i} 2>&1 | tee outputs/${RUN_NAME}_${i}shot_eval.log & 29 | # done 30 | # wait < <(jobs -p) 31 | done -------------------------------------------------------------------------------- /configs/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00006, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | 42 | "train_micro_batch_size_per_gpu": 4, 43 | "data-impl": "mmap", 44 | 45 | "checkpoint-activations": true, 46 | "checkpoint-num-layers": 1, 47 | "partition-activations": true, 48 | "synchronize-each-layer": true, 49 | 50 | "gradient_clipping": 1.0, 51 | "weight-decay": 0.1, 52 | "hidden-dropout": 0.0, 53 | "attention-dropout": 0.0, 54 | 55 | "fp16": { 56 | "enabled": true, 57 | "loss_scale": 0, 58 | "loss_scale_window": 1000, 59 | "hysteresis": 2, 60 | "min_loss_scale": 1 61 | }, 62 | 63 | "train-iters": 320000, 64 | "lr-decay-iters": 320000, 65 | "distributed-backend": "nccl", 66 | "lr-decay-style": "cosine", 67 | "warmup": 0.01, 68 | "checkpoint-factor": 10000, 69 | "eval-interval": 1000, 70 | "eval-iters": 10, 71 | 72 | "log-interval": 100, 73 | "steps_per_print": 10, 74 | "keep-last-n-checkpoints": 4, 75 | "wall_clock_breakdown": true, 76 | 77 | "hostfile": "/mock_path" 78 | } 79 | -------------------------------------------------------------------------------- /scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | MODEL_CONFIG=$1 2 | EVAL_CONFIG=$2 3 | STEP=$3 4 | NUM_FEWSHOT=${4:-0} 5 | GPU=${5:-0} 6 | 7 | # Can't use perplexity-based evaluation tasks with in-context examples 8 | if [ ${NUM_FEWSHOT} -eq 0 ]; then 9 | EVAL_TASKS="lambada_openai piqa winogrande wsc arc_easy sciq logiqa wikitext openbookqa hendrycksTest-*" 10 | else 11 | # if using few-shot, then we can't use wikitext 12 | EVAL_TASKS="lambada_openai piqa winogrande wsc arc_easy sciq logiqa openbookqa hendrycksTest-*" 13 | fi 14 | # Temporarily not using triviaqa because it can't download? 15 | # Not using: 16 | # webqs (web questions) because our models have very poor performance (0.005 accuracy) 17 | # squad2 because it leads to: AttributeError: 'SequentialWrapper' object has no attribute 'clear_cache' 18 | 19 | # MODEL_CONFIG should be from the output of the training script 20 | # For example, gpt-neox/outputs/160m_doremi_280_seed42/global_step100000/configs/160m_doremi_280_seed42.yml 21 | # It should have additional fields. 
E.g.: 22 | # "load": "outputs/160m_doremi_280_seed42", 23 | # "eval_results_prefix": "outputs/160m_doremi_280_seed42", 24 | # CONFIGS="outputs/160m_doremi_280_seed42/global_step100000/configs/160m_doremi_280_seed42.yml alon_configs/models/eval_160m_1gpu.yml" 25 | python3 tools/create_eval_config.py --config_path ${MODEL_CONFIG} --num_fewshot ${NUM_FEWSHOT} --iteration ${STEP} 26 | 27 | # if not using num_fewshot, then you can just use the following: 28 | if [ ${NUM_FEWSHOT} -eq 0 ]; then 29 | # EVAL_MODEL_CONFIG is MODEL_CONFIG with .yml replaced by _eval.yml 30 | EVAL_MODEL_CONFIG=${MODEL_CONFIG%.yml}_eval.yml 31 | else 32 | EVAL_MODEL_CONFIG=${MODEL_CONFIG%.yml}_eval_${NUM_FEWSHOT}shot.yml 33 | fi 34 | 35 | # Get GPU Config 36 | GPU_CONFIG=alon_configs/gpu/gpu${GPU}.yml 37 | 38 | 39 | # EVAL_CONFIG should be in the configs folder. See alon_configs/models/eval_160m_1gpu.yml for an example 40 | python ./deepy.py evaluate.py ${EVAL_MODEL_CONFIG} ${EVAL_CONFIG} ${GPU_CONFIG} --eval_tasks ${EVAL_TASKS} -------------------------------------------------------------------------------- /configs/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num-layers": 12, 12 | "hidden-size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num-attention-heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 15 | "seq-length": 2048, 16 | "max-position-embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos-emb": "none", 19 | "no-weight-tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e-8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data-impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint-activations": true, 38 | "checkpoint-num-layers": 1, 39 | "partition-activations": false, 40 | "synchronize-each-layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight-decay": 0.1, 45 | "hidden-dropout": 0.0, 46 | "attention-dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. training settings 58 | "train-iters": 320000, 59 | "lr-decay-iters": 320000, 60 | "distributed-backend": "nccl", 61 | "lr-decay-style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint-factor": 10000, 64 | "eval-interval": 1000, 65 | "eval-iters": 10, 66 | 67 | # logging 68 | "log-interval": 100, 69 | "steps_per_print": 10, 70 | "keep-last-n-checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /configs/19M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | # model settings 6 | "num-layers": 6, 7 | "hidden-size": 512, 8 | "num-attention-heads": 8, 9 | "seq-length": 2048, 10 | "max-position-embeddings": 2048, 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt-j-residual": false, 14 | "output-layer-parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.001, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.0001, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 4, #32, 45 | "gas": 1, 46 | "data-impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 
57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train-iters": 143000, 73 | "lr-decay-iters": 143000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 1000, 78 | "eval-interval": 100000, 79 | "eval-iters": 10, 80 | 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /configs/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | # model settings 6 | "num-layers": 16, 7 | "hidden-size": 2048, 8 | "num-attention-heads": 8, 9 | "seq-length": 2048, 10 | "max-position-embeddings": 2048, 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt-j-residual": false, 14 | "output-layer-parallelism": "column", 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00025, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.000025, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 16, 45 | "gas": 1, 46 | "data-impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train-iters": 143000, 73 | "lr-decay-iters": 143000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 1000, 78 | "eval-interval": 40000, 79 | "eval-iters": 10, 80 | 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from tools.corpora import prepare_dataset, DATA_DOWNLOADERS 16 | import argparse 17 | 18 | TOKENIZER_CHOICES = [ 19 | "HFGPT2Tokenizer", 20 | "HFTokenizer", 21 | "GPT2BPETokenizer", 22 | "CharLevelTokenizer", 23 | "TiktokenTokenizer", 24 | ] 25 | DATASET_CHOICES = [i for i in DATA_DOWNLOADERS.keys() if i != "pass"] 26 | 27 | 28 | def get_args(): 29 | parser = argparse.ArgumentParser(description="Download & preprocess neox datasets") 30 | parser.add_argument( 31 | "dataset", 32 | nargs="?", 33 | default="enron", 34 | help="name of dataset to download.", 35 | choices=DATASET_CHOICES, 36 | ) 37 | parser.add_argument( 38 | "-t", 39 | "--tokenizer", 40 | default="GPT2BPETokenizer", 41 | choices=TOKENIZER_CHOICES, 42 | help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}', 43 | ) 44 | parser.add_argument( 45 | "-d", 46 | "--data-dir", 47 | default=None, 48 | help=f"Directory to which to download datasets / tokenizer " 49 | f"files - defaults to ./data", 50 | ) 51 | parser.add_argument( 52 | "-v", "--vocab-file", default=None, help=f"Tokenizer vocab file (if required)" 53 | ) 54 | parser.add_argument( 55 | "-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)" 56 | ) 57 | return parser.parse_args() 58 | 59 | 60 | if __name__ == "__main__": 61 | args = get_args() 62 | prepare_dataset( 63 | dataset_name=args.dataset, 64 | tokenizer_type=args.tokenizer, 65 | data_dir=args.data_dir, 66 | vocab_file=args.vocab_file, 67 | merge_file=args.merge_file, 68 | ) 69 | -------------------------------------------------------------------------------- /megatron/fused_kernels/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def _get_cuda_bare_metal_version(cuda_dir): 9 | raw_output = subprocess.check_output( 10 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 11 | ) 12 | output = raw_output.split() 13 | release_idx = output.index("release") + 1 14 | release = output[release_idx].split(".") 15 | bare_metal_major = release[0] 16 | bare_metal_minor = release[1][0] 17 | 18 | return raw_output, bare_metal_major, bare_metal_minor 19 | 20 | 21 | srcpath = Path(__file__).parent.absolute() 22 | cc_flag = [] 23 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append("-gencode") 26 | cc_flag.append("arch=compute_80,code=sm_80") 27 | 28 | nvcc_flags = [ 29 | "-O3", 30 | "-gencode", 31 | "arch=compute_70,code=sm_70", 32 | "--use_fast_math", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "--expt-relaxed-constexpr", 36 | "--expt-extended-lambda", 37 | ] 38 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} 39 | layernorm_cuda_args = { 40 | "cxx": ["-O3"], 41 | "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], 42 | } 43 | setup( 44 | 
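    # (sketch) two CUDA extensions are declared below, one per fused softmax
    # kernel; building them needs a CUDA toolkit whose nvcc matches the flags
    # above, e.g. `python megatron/fused_kernels/setup.py install`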
name="fused_kernels", 45 | version="0.0.1", 46 | author="Sid Black & Alejandro Molina et al.", 47 | author_email="alejandro.molina@aleph-alpha.de", 48 | include_package_data=False, 49 | ext_modules=[ 50 | CUDAExtension( 51 | "scaled_upper_triang_masked_softmax_cuda", 52 | [ 53 | str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), 54 | str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), 55 | ], 56 | extra_compile_args=cuda_ext_args, 57 | ), 58 | CUDAExtension( 59 | "scaled_masked_softmax_cuda", 60 | [ 61 | str(srcpath / "scaled_masked_softmax.cpp"), 62 | str(srcpath / "scaled_masked_softmax_cuda.cu"), 63 | ], 64 | extra_compile_args=cuda_ext_args, 65 | ), 66 | ], 67 | cmdclass={"build_ext": BuildExtension}, 68 | ) 69 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_io_parallel_group 35 | from .initialize import initialize_model_parallel 36 | from .initialize import model_parallel_is_initialized 37 | 38 | from .layers import ColumnParallelLinear 39 | from .layers import RowParallelLinear 40 | from .layers import VocabParallelEmbedding 41 | from .layers import ParallelRelativePositionBias 42 | 43 | from .mappings import copy_to_model_parallel_region 44 | from .mappings import gather_from_model_parallel_region 45 | from .mappings import reduce_from_model_parallel_region 46 | from .mappings import scatter_to_model_parallel_region 47 | 48 | from .random import checkpoint 49 | from .random import get_cuda_rng_tracker 50 | from .random import model_parallel_cuda_manual_seed 51 | 52 | from .utils import divide 53 | from .utils import split_tensor_along_last_dim 54 | -------------------------------------------------------------------------------- /configs/49M.yml: -------------------------------------------------------------------------------- 1 | { 
2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | 6 | # model settings 7 | "num-layers": 10, 8 | "hidden-size": 640, 9 | "num-attention-heads": 10, 10 | "seq-length": 2048, 11 | "max-position-embeddings": 2048, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | # init methods 23 | "init_method": "small_init", 24 | "output_layer_init_method": "wang_init", 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0008, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8, 33 | } 34 | }, 35 | "min_lr": 0.00008, 36 | 37 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 38 | "zero_optimization": { 39 | "stage": 1, 40 | "allgather_partitions": True, 41 | "allgather_bucket_size": 500000000, 42 | "overlap_comm": True, 43 | "reduce_scatter": True, 44 | "reduce_bucket_size": 500000000, 45 | "contiguous_gradients": True, 46 | }, 47 | 48 | # batch / data settings 49 | "train_micro_batch_size_per_gpu": 32, 50 | "gas": 1, 51 | "data-impl": "mmap", 52 | "num_workers": 1, 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "initial_scale_power": 12, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1, 75 | }, 76 | 77 | # misc. 
training settings 78 | "train-iters": 143000, 79 | "lr-decay-iters": 143000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 1000, 84 | "eval-interval": 100000, 85 | "eval-iters": 10, 86 | 87 | # logging 88 | "log-interval": 10, 89 | "steps_per_print": 10, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/bnb_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "use-bnb-optimizer": true, 18 | 19 | # these should provide some speedup but takes a while to build, set to true if desired 20 | "scaled-upper-triang-masked-softmax-fusion": false, 21 | "bias-gelu-fusion": false, 22 | 23 | 24 | # optimizer settings 25 | "optimizer": { 26 | "type": "Adam", 27 | "params": { 28 | "lr": 0.0006, 29 | "betas": [0.9, 0.999], 30 | "eps": 1.0e-8, 31 | } 32 | }, 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | "zero_optimization": { 35 | "stage": 0, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | # batch / data settings 45 | "train_micro_batch_size_per_gpu": 4, 46 | "data-impl": "mmap", 47 | "split": "949,50,1", 48 | 49 | # activation checkpointing 50 | "checkpoint-activations": true, 51 | "checkpoint-num-layers": 1, 52 | "partition-activations": true, 53 | "synchronize-each-layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.0, 58 | "hidden-dropout": 0.0, 59 | "attention-dropout": 0.0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # misc. 
training settings 71 | "train-iters": 320000, 72 | "lr-decay-iters": 320000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | "eval-iters": 10, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "keep-last-n-checkpoints": 4, 84 | "wall_clock_breakdown": true, 85 | } 86 | -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_seqlen2048_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | 
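  # attention-config pairs an attention type with a layer count; the entry
  # below applies the "flash" (FlashAttention) kernel to all 16 layers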
"attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | 
"synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_eval_datasets": true, 94 | "max_validation_samples_per_dataset": 5000, 95 | 96 | # "keep_last_n_checkpoints": 1 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/3B_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 6, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": 
"/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_seqlen2048_unnamed_train_datasets.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 240, 20 | "train_micro_batch_size_per_gpu": 30, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_eval_datasets": true, 94 | "max_validation_samples_per_dataset": 5000, 95 | 96 | # "keep_last_n_checkpoints": 1 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/410m.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 24, 4 | "hidden-size": 1024, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 24]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | 
"bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0003, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.00003, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | 
"weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_seqlen2048.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 240, 20 | "train_micro_batch_size_per_gpu": 30, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain 
specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/1B_150B_tokens.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 480, 20 | "train_micro_batch_size_per_gpu": 60, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 300000, 72 | "lr-decay-iters": 300000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | # "keep_last_n_checkpoints": 1 98 | 99 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_3B_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": 
true, 19 | 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_3B_seqlen2048_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 2048, 7 | "max-position-embeddings": 2048, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | 
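  # (dropout is 0 here as in the other eval configs; regularization comes
  # from gradient clipping and weight decay alone)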
"gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | } -------------------------------------------------------------------------------- /configs/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled-upper-triang-masked-softmax-fusion": false, 20 | "bias-gelu-fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 33 | "zero_optimization": { 34 | "stage": 0, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data-impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.0, 57 | "hidden-dropout": 0.0, 58 | "attention-dropout": 0.0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "enabled": true, 63 | "type": "bfloat16", # set bf16 as precision 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 71 | # misc. 
training settings 72 | "train-iters": 320000, 73 | "lr-decay-iters": 320000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | "eval-iters": 10, 80 | 81 | # logging 82 | "log-interval": 100, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 4, 85 | "wall_clock_breakdown": true, 86 | } 87 | -------------------------------------------------------------------------------- /configs/175B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 96, 10 | "hidden-size": 12288, 11 | "num-attention-heads": 96, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00006, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000006, 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | 49 | # batch / data settings 50 | "train_micro_batch_size_per_gpu": 4, 51 | "data-impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "hysteresis": 2, 72 | "min_loss_scale": 1 73 | }, 74 | 75 | # misc. 
training settings 76 | "train-iters": 320000, 77 | "lr-decay-iters": 320000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 10000, 82 | "eval-interval": 1000, 83 | "eval-iters": 10, 84 | 85 | # logging 86 | "log-interval": 100, 87 | "steps_per_print": 10, 88 | "keep-last-n-checkpoints": 4, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /configs/350M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1024, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0003, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00003, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | # batch / data settings 50 | "train_micro_batch_size_per_gpu": 4, 51 | "data-impl": "mmap", 52 | 53 | # activation checkpointing 54 | "checkpoint-activations": true, 55 | "checkpoint-num-layers": 1, 56 | "partition-activations": true, 57 | "synchronize-each-layer": true, 58 | 59 | # regularization 60 | "gradient_clipping": 1.0, 61 | "weight-decay": 0.1, 62 | "hidden-dropout": 0, 63 | "attention-dropout": 0, 64 | 65 | # precision settings 66 | "fp16": { 67 | "fp16": true, 68 | "enabled": true, 69 | "loss_scale": 0, 70 | "loss_scale_window": 1000, 71 | "hysteresis": 2, 72 | "min_loss_scale": 1 73 | }, 74 | 75 | # misc. 
training settings 76 | "train-iters": 320000, 77 | "lr-decay-iters": 320000, 78 | "distributed-backend": "nccl", 79 | "lr-decay-style": "cosine", 80 | "warmup": 0.01, 81 | "checkpoint-factor": 10000, 82 | "eval-interval": 1000, 83 | "eval-iters": 10, 84 | 85 | # logging 86 | "log-interval": 100, 87 | "steps_per_print": 10, 88 | "keep-last-n-checkpoints": 4, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /configs/1-3B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 2048, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.0002, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.00002, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/2-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 2560, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00016, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000016, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/6-7B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 32, 10 | "hidden-size": 4096, 11 | "num-attention-heads": 32, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00012, 33 | "betas": [0.9, 0.95], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | 38 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 39 | "zero_optimization": { 40 | "stage": 1, 41 | "allgather_partitions": True, 42 | "allgather_bucket_size": 500000000, 43 | "overlap_comm": True, 44 | "reduce_scatter": True, 45 | "reduce_bucket_size": 500000000, 46 | "contiguous_gradients": True, 47 | }, 48 | "min_lr": 0.000012, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/13B.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 40, 10 | "hidden-size": 5120, 11 | "num-attention-heads": 40, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0001, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | "min_lr": 0.00001, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0, 65 | "attention-dropout": 0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | # misc. 
training settings 78 | "train-iters": 320000, 79 | "lr-decay-iters": 320000, 80 | "distributed-backend": "nccl", 81 | "lr-decay-style": "cosine", 82 | "warmup": 0.01, 83 | "checkpoint-factor": 10000, 84 | "eval-interval": 1000, 85 | "eval-iters": 10, 86 | 87 | # logging 88 | "log-interval": 100, 89 | "steps_per_print": 10, 90 | "keep-last-n-checkpoints": 4, 91 | "wall_clock_breakdown": true, 92 | } 93 | -------------------------------------------------------------------------------- /configs/760M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 24, 10 | "hidden-size": 1536, 11 | "num-attention-heads": 16, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | # optimizer settings 29 | "optimizer": { 30 | "type": "Adam", 31 | "params": { 32 | "lr": 0.00025, 33 | "betas": [0.9, 0.999], 34 | "eps": 1.0e-8, 35 | } 36 | }, 37 | "min_lr": 0.000025, 38 | 39 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 40 | "zero_optimization": { 41 | "stage": 1, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # batch / data settings 51 | "train_micro_batch_size_per_gpu": 4, 52 | "data-impl": "mmap", 53 | 54 | # activation checkpointing 55 | "checkpoint-activations": true, 56 | "checkpoint-num-layers": 1, 57 | "partition-activations": true, 58 | "synchronize-each-layer": true, 59 | 60 | # regularization 61 | "gradient_clipping": 1.0, 62 | "weight-decay": 0.1, 63 | "hidden-dropout": 0, 64 | "attention-dropout": 0, 65 | 66 | # precision settings 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | } 92 | -------------------------------------------------------------------------------- /configs/125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe-parallel-size": 1, 6 | "model-parallel-size": 1, 7 | 8 | # model settings 9 | "num-layers": 12, 10 | "hidden-size": 768, 11 | "num-attention-heads": 12, 12 | "seq-length": 2048, 13 | "max-position-embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos-emb": "rotary", 16 | "no-weight-tying": true, 17 | "gpt_j_residual": false, 18 | "output_layer_parallelism": "column", 19 | 20 | # these should provide some speedup but takes a while to build, set to true if desired 21 | "scaled-upper-triang-masked-softmax-fusion": false, 22 | "bias-gelu-fusion": false, 23 | 24 | # init methods 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | 29 | # optimizer settings 30 | "optimizer": { 31 | "type": "Adam", 32 | "params": { 33 | "lr": 0.0006, 34 | "betas": [0.9, 0.95], 35 | "eps": 1.0e-8, 36 | } 37 | }, 38 | "min_lr": 0.00006, 39 | 40 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 41 | "zero_optimization": { 42 | "stage": 1, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # batch / data settings 52 | "train_micro_batch_size_per_gpu": 4, 53 | "data-impl": "mmap", 54 | 55 | # activation checkpointing 56 | "checkpoint-activations": true, 57 | "checkpoint-num-layers": 1, 58 | "partition-activations": true, 59 | "synchronize-each-layer": true, 60 | 61 | # regularization 62 | "gradient_clipping": 1.0, 63 | "weight-decay": 0.1, 64 | "hidden-dropout": 0.0, 65 | "attention-dropout": 0.0, 66 | 67 | # precision settings 68 | "fp16": { 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | # misc. 
training settings 77 | "train-iters": 320000, 78 | "lr-decay-iters": 320000, 79 | "distributed-backend": "nccl", 80 | "lr-decay-style": "cosine", 81 | "warmup": 0.01, 82 | "checkpoint-factor": 10000, 83 | "eval-interval": 1000, 84 | "eval-iters": 10, 85 | 86 | # logging 87 | "log-interval": 100, 88 | "steps_per_print": 10, 89 | "keep-last-n-checkpoints": 4, 90 | "wall_clock_breakdown": true, 91 | 92 | # networking 93 | "hostfile": "/mock_path" 94 | } 95 | -------------------------------------------------------------------------------- /alon_configs/models/3B.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 2,846,767,360 parameters 3 | "num-layers": 32, 4 | "hidden-size": 2560, 5 | "num-attention-heads": 32, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "norm": "layernorm", 9 | "pos-emb": "rotary", 10 | "rotary-pct": 0.25, 11 | "no-weight-tying": true, 12 | "gpt-j-residual": true, 13 | "output-layer-parallelism": "column", 14 | 15 | "attention-config": [[["flash"], 32]], 16 | 17 | "scaled-upper-triang-masked-softmax-fusion": true, 18 | "bias-gelu-fusion": true, 19 | 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 6, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.00016, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.000016, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | # "keep_last_n_checkpoints": 1 99 | 100 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_410m_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 24, 4 | "hidden-size": 1024, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | 
"rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 24]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.0003, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.00003, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:0", 100 | "world_size": 1, 101 | "master_port": 12344, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local1.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | 
"reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:1", 100 | "world_size": 1, 101 | "master_port": 12341, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local2.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | 
"warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:2", 100 | "world_size": 1, 101 | "master_port": 12342, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local3.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:3", 100 | "world_size": 1, 101 | "master_port": 12343, 102 | 103 | } -------------------------------------------------------------------------------- 
/alon_configs/models/eval_1B_1gpu_local4.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | "type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:4", 100 | "world_size": 1, 101 | "master_port": 12344, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_1B_1gpu_local5.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings 3 | "num-layers": 16, 4 | "hidden-size": 2048, 5 | "num-attention-heads": 16, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 16]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | "train_batch_size": 10, 20 | "train_micro_batch_size_per_gpu": 10, 21 | "gradient_accumulation_steps": 1, 22 | "data-impl": "mmap", 23 | "num_workers": 1, 24 | 25 | # optimizer settings 26 | "optimizer": { 27 | 
"type": "Adam", 28 | "params": { 29 | "lr": 0.00025, 30 | "betas": [0.9, 0.95], 31 | "eps": 1.0e-8 32 | } 33 | }, 34 | "min_lr": 0.000025, 35 | 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 500000000, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 500000000, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | }, 46 | 47 | # activation checkpointing 48 | "checkpoint-activations": true, 49 | "checkpoint-num-layers": 1, 50 | "partition-activations": true, 51 | "synchronize-each-layer": true, 52 | 53 | # regularization 54 | "gradient_clipping": 1.0, 55 | "weight-decay": 0.1, 56 | "hidden-dropout": 0, 57 | "attention-dropout": 0, 58 | 59 | # precision settings 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | # train settings 71 | "train-iters": 100000, 72 | "lr-decay-iters": 100000, 73 | "distributed-backend": "nccl", 74 | "lr-decay-style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint-factor": 10000, 77 | "eval-interval": 1000, 78 | 79 | # logging 80 | "log-interval": 100, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | # tokenizer settings 85 | "tokenizer-type": "HFTokenizer", 86 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 87 | 88 | # wandb settings 89 | "use_wandb": true, 90 | "wandb_project": "neox", 91 | 92 | # domain specific settings 93 | "use_named_train_datasets": true, 94 | "use_named_eval_datasets": true, 95 | "max_validation_samples_per_dataset": 5000, 96 | 97 | "keep_last_n_checkpoints": 1, 98 | 99 | "include": "localhost:5", 100 | "world_size": 1, 101 | "master_port": 12345, 102 | 103 | } -------------------------------------------------------------------------------- /alon_configs/models/160m.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 162,322,944 parameters 3 | "num-layers": 12, 4 | "hidden-size": 768, 5 | "num-attention-heads": 12, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 12]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 20 | "train_batch_size": 480, 21 | "train_micro_batch_size_per_gpu": 60, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.00006, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | 
"attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | # wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | "keep_last_n_checkpoints": 1 99 | 100 | # "include": "localhost:0,1", 101 | # "world_size": 2, 102 | } -------------------------------------------------------------------------------- /alon_configs/models/eval_160m_1gpu.yml: -------------------------------------------------------------------------------- 1 | { 2 | # model settings, 162,322,944 parameters 3 | "num-layers": 12, 4 | "hidden-size": 768, 5 | "num-attention-heads": 12, 6 | "seq-length": 1024, 7 | "max-position-embeddings": 1024, 8 | "pos-emb": "rotary", 9 | "rotary-pct": 0.25, 10 | "no-weight-tying": true, 11 | "gpt-j-residual": true, 12 | "output-layer-parallelism": "column", 13 | 14 | "attention-config": [[["flash"], 12]], 15 | 16 | "scaled-upper-triang-masked-softmax-fusion": true, 17 | "bias-gelu-fusion": true, 18 | 19 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 20 | "train_batch_size": 10, 21 | "train_micro_batch_size_per_gpu": 10, 22 | "gradient_accumulation_steps": 1, 23 | "data-impl": "mmap", 24 | "num_workers": 1, 25 | 26 | # optimizer settings 27 | "optimizer": { 28 | "type": "Adam", 29 | "params": { 30 | "lr": 0.0006, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-8 33 | } 34 | }, 35 | "min_lr": 0.00006, 36 | 37 | "zero_optimization": { 38 | "stage": 1, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 500000000, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 500000000, 44 | "contiguous_gradients": true, 45 | "cpu_offload": false 46 | }, 47 | 48 | # activation checkpointing 49 | "checkpoint-activations": true, 50 | "checkpoint-num-layers": 1, 51 | "partition-activations": true, 52 | "synchronize-each-layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight-decay": 0.1, 57 | "hidden-dropout": 0, 58 | "attention-dropout": 0, 59 | 60 | # precision settings 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "initial_scale_power": 12, 67 | "hysteresis": 2, 68 | "min_loss_scale": 1 69 | }, 70 | 71 | # train settings 72 | "train-iters": 100000, 73 | "lr-decay-iters": 100000, 74 | "distributed-backend": "nccl", 75 | "lr-decay-style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint-factor": 10000, 78 | "eval-interval": 1000, 79 | 80 | # logging 81 | "log-interval": 100, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | # tokenizer settings 86 | "tokenizer-type": "HFTokenizer", 87 | "vocab-file": "/share/edc/home/alon_albalak/tokenizers/20B_tokenizer.json", 88 | 89 | 
# wandb settings 90 | "use_wandb": true, 91 | "wandb_project": "neox", 92 | 93 | # domain specific settings 94 | "use_named_train_datasets": true, 95 | "use_named_eval_datasets": true, 96 | "max_validation_samples_per_dataset": 5000, 97 | 98 | "keep_last_n_checkpoints": 1, 99 | 100 | "include": "localhost:0", 101 | "world_size": 1, 102 | "master_port": 12344, 103 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb logs 132 | wandb/ 133 | 134 | # data files 135 | data/**/*.idx 136 | data/**/*.bin 137 | data/**/*.json* 138 | data/**/*.txt 139 | data/**/*.gz 140 | data/**/*.np* 141 | data/**/*.npy 142 | checkpoints/ 143 | .vscode/ 144 | *.pt 145 | *.ckpt 146 | outputs/* 147 | 148 | #test logs 149 | test_checkpoint/ 150 | test_logs/ 151 | logs/ 152 | tensorboard/ 153 | src/ 154 | 155 | # configs 156 | alon_configs/run_specific/* -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from megatron import print_rank_0 26 | from megatron import mpu 27 | 28 | 29 | class BlendableDataset(torch.utils.data.Dataset): 30 | def __init__(self, datasets, weights): 31 | self.datasets = datasets 32 | num_datasets = len(datasets) 33 | assert num_datasets == len(weights) 34 | 35 | self.size = 0 36 | for dataset in self.datasets: 37 | self.size += len(dataset) 38 | 39 | # Normalize weights. 40 | weights = np.array(weights, dtype=np.float64) 41 | sum_weights = np.sum(weights) 42 | assert sum_weights > 0.0 43 | weights /= sum_weights 44 | 45 | # Build indices. 
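# NOTE (added comment): helpers.build_blending_indices below is the compiled
# C++ routine from megatron/data/helpers.cpp, built via megatron/data/Makefile.
# As a rough pure-Python sketch of the blending logic (illustration only, not
# the code that runs), it fills the two index arrays approximately like this:
#
#     current = np.zeros(num_datasets)
#     for i in range(self.size):
#         errors = weights * (i + 1) - current  # how far each dataset lags its quota
#         d = int(np.argmax(errors))            # pick the most under-sampled dataset
#         self.dataset_index[i] = d
#         self.dataset_sample_index[i] = int(current[d])
#         current[d] += 1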
46 | start_time = time.time() 47 | assert num_datasets < 255 48 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 49 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 50 | 51 | from megatron.data import helpers 52 | 53 | helpers.build_blending_indices( 54 | self.dataset_index, 55 | self.dataset_sample_index, 56 | weights, 57 | num_datasets, 58 | self.size, 59 | torch.distributed.get_rank() == 0, 60 | ) 61 | 62 | print( 63 | "> RANK {} elapsed time for building blendable dataset indices: " 64 | "{:.2f} (sec)".format( 65 | torch.distributed.get_rank(), time.time() - start_time 66 | ) 67 | ) 68 | 69 | def __len__(self): 70 | return self.size 71 | 72 | def __getitem__(self, idx): 73 | try: 74 | dataset_idx = self.dataset_index[idx] 75 | sample_idx = self.dataset_sample_index[idx] 76 | return self.datasets[dataset_idx][sample_idx] 77 | except IndexError: 78 | new_idx = idx % len(self) 79 | print( 80 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 81 | ) 82 | return self[new_idx] 83 | -------------------------------------------------------------------------------- /tests/neox_args/test_neoxargs_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | plausibility check for the usage of neox_args in the megatron codebase 17 | """ 18 | import pytest 19 | import re 20 | from ..common import get_root_directory 21 | 22 | 23 | @pytest.mark.cpu 24 | def test_neoxargs_usage(): 25 | """ " 26 | checks for code pieces of the pattern "args.*" and verifies that such used arg is defined in NeoXArgs 27 | """ 28 | from megatron.neox_arguments import NeoXArgs 29 | 30 | declared_all = True 31 | neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys()) 32 | 33 | # we exclude a number of properties (implemented with the @property decorator) or functions that we know exists 34 | exclude = set( 35 | [ 36 | "params_dtype", 37 | "deepspeed_config", 38 | "get", 39 | "pop", 40 | "get_deepspeed_main_args", 41 | 'optimizer["params"]', 42 | "attention_config[layer_number]", 43 | "adlr_autoresume_object", 44 | "update_value", 45 | "all_config", 46 | "tensorboard_writer", 47 | "tokenizer", 48 | "train_batch_size]", 49 | "items", 50 | "configure_distributed_args", 51 | "build_tokenizer", 52 | "attention_config[i]", 53 | "print", 54 | ] 55 | ) 56 | 57 | # test file by file 58 | for filename in (get_root_directory() / "megatron").glob("**/*.py"): 59 | if filename.name in ["text_generation_utils.py", "train_tokenizer.py"]: 60 | continue 61 | 62 | # load file 63 | with open(filename, "r") as f: 64 | file_contents = f.read() 65 | 66 | # find args matches 67 | matches = list( 68 | re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) 69 | ) 70 | if len(matches) == 0: 71 | continue 72 | 73 | # compare 74 | for match in matches: 75 | if match not in neox_args_attributes and match not in exclude: 76 | print( 77 | f"(arguments used not found in neox args): {filename.name}: {match}", 78 | flush=True, 79 | ) 80 | declared_all = False 81 | 82 | assert declared_all, "all arguments used in code defined in NeoXArgs" 83 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 |
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 |
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 |
25 | torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);
26 |
27 | torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
28 |                        torch::Tensor const& softmax_results,
29 |                        float scale_factor);
30 |
31 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor)
32 | {
33 |     AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
34 |     AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
35 |                    (input.scalar_type() == at::ScalarType::BFloat16),
36 |                "Only fp16 and bf16 are supported");
37 |
38 |     return fwd_cuda(input, scale_factor);
39 | }
40 |
41 | torch::Tensor bwd(torch::Tensor const& output_grads,
42 |                   torch::Tensor const& softmax_results,
43 |                   float scale_factor)
44 | {
45 |     AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
46 |     AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
47 |
48 |     AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 |                    (output_grads.scalar_type() == at::ScalarType::BFloat16),
50 |                "Only fp16 and bf16 are supported");
51 |     AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 |                    (softmax_results.scalar_type() == at::ScalarType::BFloat16),
53 |                "Only fp16 and bf16 are supported");
54 |
55 |     return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 |
58 | } // end namespace scaled_upper_triang_masked_softmax
59 | } // end namespace fused_softmax
60 | } // end namespace multihead_attn
61 |
62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
63 | {
64 |     m.def("forward",
65 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
66 |           "Self Multihead Attention scaled, time masked softmax -- Forward.");
67 |     m.def("backward",
68 |           &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
69 |           "Self Multihead Attention scaled, time masked softmax -- Backward.");
70 | }
71 |
-------------------------------------------------------------------------------- /megatron/mpu/utils.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2021, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | import torch
20 |
21 |
22 | def ensure_divisibility(numerator, denominator):
23 |     """Ensure that numerator is divisible by the denominator."""
24 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
25 |         numerator, denominator
26 |     )
27 |
28 |
29 | def divide(numerator, denominator):
30 |     """Ensure that numerator is divisible by the denominator and return
31 |     the division value."""
32 |     ensure_divisibility(numerator, denominator)
33 |     return numerator // denominator
34 |
35 |
36 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
37 |     """Split a tensor along its last dimension.
38 |     Arguments:
39 |         tensor: input tensor.
40 |         num_partitions: number of partitions to split the tensor
41 |         contiguous_split_chunks: If True, make each chunk contiguous
42 |                                  in memory.
43 |     """
44 |     # Get the size and dimension.
45 |     last_dim = tensor.dim() - 1
46 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
47 |     # Split.
48 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
49 |     # Note: torch.split does not create contiguous tensors by default.
50 |     if contiguous_split_chunks:
51 |         return tuple(chunk.contiguous() for chunk in tensor_list)
52 |
53 |     return tensor_list
54 |
55 |
56 | class VocabUtility:
57 |     """Split the vocabulary into `world_size` chunks and return the
58 |     first and last index of the vocabulary belonging to the `rank`
59 |     partition. Note that the returned indices cover the half-open range [first, last)."""
60 |
61 |     @staticmethod
62 |     def vocab_range_from_per_partition_vocab_size(
63 |         per_partition_vocab_size, rank, world_size
64 |     ):
65 |         index_f = rank * per_partition_vocab_size
66 |         index_l = index_f + per_partition_vocab_size
67 |         return index_f, index_l
68 |
69 |     @staticmethod
70 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
71 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
72 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
73 |             per_partition_vocab_size, rank, world_size
74 |         )
75 |
-------------------------------------------------------------------------------- /megatron/neox_arguments/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | NeoX Arguments manages all configuration arguments.
3 |
4 | **general**
5 |
6 | * The implementation makes use of the python dataclass.
7 | * The main class 'NeoXArgs' (in ./arguments) exposes all configuration attributes that are relevant to GPT NeoX
8 | * No attributes are nested (apart from attributes with type dict)
9 | * Output functions (enable_logging, save_yml, print) are implemented
10 | * Instantiation always runs NeoXArgs.__post_init__(), which calculates derived values and performs a validation (values, types, keys).
11 | * It is possible to set undefined attributes (e.g. the line of code 'NeoXArgs().my_undefined_config = 42' works fine); such set attributes are not validated
12 | * It is possible to update attributes (e.g.
the line of code 'NeoXArgs().do_train = True' works fine); a validation can be performed by calling the validation functions on the class instance
13 | * In order to avoid setting undefined attributes you can use the function NeoXArgs().update_value(); this function raises an error if the attribute to be set is not defined
14 |
15 | **instantiation**
16 | NeoX args can be instantiated with the following options
17 |
18 | * NeoXArgs.from_ymls(["path_to_yaml1", "path_to_yaml2", ...]): load yaml configuration files and instantiate with the values provided; checks for duplications and unknown arguments are performed
19 | * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from a dict; checks for unknown arguments are performed
20 |
21 | * NeoXArgs.consume_deepy_args(): entry point for deepy.py, configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main
22 | * NeoXArgs.consume_neox_args(): in the call stack deepy.py -> deepspeed -> pretrain_gpt2.py, arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(); the arguments so produced can be read with consume_neox_args() to instantiate a NeoXArgs instance.
23 |
24 |
25 | **code structure**
26 |
27 | * NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
28 | * The subclasses group args according to their purpose
29 | * The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be omitted
30 | * The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted
31 | * calculated attributes (decorator '@property') are available as attributes, but are not included in the dataclass fields (e.g.
NeoXArgs().__dataclass_fields__.items()) 32 | * refer to docstrings in code for more information 33 | """ 34 | 35 | 36 | from .arguments import NeoXArgs 37 | -------------------------------------------------------------------------------- /configs/alon_config_small.yml: -------------------------------------------------------------------------------- 1 | { 2 | # parallelism settings 3 | "pipe-parallel-size": 1, 4 | "model-parallel-size": 1, 5 | 6 | # model settings, 162,322,944 parameters 7 | "num-layers": 12, 8 | "hidden-size": 768, 9 | "num-attention-heads": 12, 10 | "seq-length": 1024, 11 | "max-position-embeddings": 1024, 12 | "pos-emb": "rotary", 13 | "rotary-pct": 0.25, 14 | "no-weight-tying": true, 15 | "gpt-j-residual": true, 16 | "output-layer-parallelism": "column", 17 | 18 | "attention-config": [[["flash"], 12]], 19 | 20 | "scaled-upper-triang-masked-softmax-fusion": true, 21 | "bias-gelu-fusion": true, 22 | 23 | # batch size (training on 1 gpu, takes up 37.5Gb of memory) 24 | "train_batch_size": 480, 25 | "train_micro_batch_size_per_gpu": 60, 26 | "gradient_accumulation_steps": 8, 27 | "data-impl": "mmap", 28 | "num_workers": 1, 29 | 30 | # train settings 31 | "train-iters": 100000, 32 | "lr-decay-iters": 100000, 33 | "distributed-backend": "nccl", 34 | "lr-decay-style": "cosine", 35 | "warmup": 0.01, 36 | "checkpoint-factor": 1000, 37 | "eval-interval": 1000, 38 | "eval-iters": 10, 39 | 40 | "log-interval": 10, 41 | "steps_per_print": 10, 42 | "wall_clock_breakdown": true, 43 | 44 | # tokenizer settings 45 | "tokenizer-type": "GPT2BPETokenizer", 46 | "vocab-file": "data/gpt2-vocab.json", 47 | "merge-file": "data/gpt2-merges.txt", 48 | 49 | # wandb settings 50 | "use_wandb": true, 51 | "wandb_project": "neox", 52 | "wandb_group": "alon_config_small", 53 | 54 | # init methods 55 | "init_method": "small_init", 56 | "output_layer_init_method": "wang_init", 57 | 58 | # optimizer settings 59 | "optimizer": { 60 | "type": "Adam", 61 | "params": { 62 | "lr": 0.001, 63 | "betas": [0.9, 0.95], 64 | "eps": 1.0e-8 65 | } 66 | }, 67 | "min_lr": 0.0001, 68 | 69 | "zero_optimization": { 70 | "stage": 1, 71 | "allgather_partitions": true, 72 | "allgather_bucket_size": 500000000, 73 | "overlap_comm": true, 74 | "reduce_scatter": true, 75 | "reduce_bucket_size": 500000000, 76 | "contiguous_gradients": true, 77 | "cpu_offload": false 78 | }, 79 | 80 | # activation checkpointing 81 | "checkpoint-activations": true, 82 | "checkpoint-num-layers": 1, 83 | "partition-activations": true, 84 | "synchronize-each-layer": true, 85 | 86 | # regularization 87 | "gradient_clipping": 1.0, 88 | "weight-decay": 0.1, 89 | "hidden-dropout": 0, 90 | "attention-dropout": 0, 91 | 92 | # precision settings 93 | "fp16": { 94 | "fp16": true, 95 | "enabled": true, 96 | "loss_scale": 0, 97 | "loss_scale_window": 1000, 98 | "initial_scale_power": 12, 99 | "hysteresis": 2, 100 | "min_loss_scale": 1 101 | }, 102 | 103 | "train-data-paths": ["data/openwebtext-processed_text_document"], 104 | "valid-data-paths": ["data/openwebtext-processed_text_document"], 105 | "test-data-paths": ["data/openwebtext-processed_text_document"], 106 | 107 | # to use a single GPU 108 | # "include": "localhost:7", 109 | # "world_size": 1, 110 | } -------------------------------------------------------------------------------- /megatron/model/norms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # 
you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torch.nn import LayerNorm as LayerNorm 17 | 18 | 19 | def get_norm(neox_args): 20 | if neox_args.norm == "rmsnorm": 21 | norm = RMSNorm 22 | eps = neox_args.rms_norm_epsilon 23 | elif neox_args.norm == "layernorm": 24 | eps = neox_args.layernorm_epsilon 25 | norm = LayerNorm 26 | elif neox_args.norm == "scalenorm": 27 | eps = neox_args.scalenorm_epsilon 28 | norm = ScaleNorm 29 | else: 30 | raise ValueError(f"norm {neox_args.norm} not recognized") 31 | return norm, eps 32 | 33 | 34 | class RMSNorm(torch.nn.Module): 35 | def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): 36 | """ 37 | Root Mean Square Layer Normalization 38 | :param dim: model size 39 | :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) 40 | :param eps: epsilon value, default 1e-8 41 | :param bias: whether use bias term for RMSNorm, disabled by 42 | default because RMSNorm doesn't enforce re-centering invariance. 43 | """ 44 | super(RMSNorm, self).__init__() 45 | 46 | self.eps = eps 47 | self.d = dim 48 | self.p = p 49 | self.bias = bias 50 | 51 | self.scale = torch.nn.Parameter(torch.ones(dim)) 52 | self.register_parameter("scale", self.scale) 53 | 54 | if self.bias: 55 | self.offset = torch.nn.Parameter(torch.zeros(dim)) 56 | self.register_parameter("offset", self.offset) 57 | 58 | def forward(self, x): 59 | if self.p < 0.0 or self.p > 1.0: 60 | norm_x = x.norm(2, dim=-1, keepdim=True) 61 | d_x = self.d 62 | else: 63 | partial_size = int(self.d * self.p) 64 | partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) 65 | 66 | norm_x = partial_x.norm(2, dim=-1, keepdim=True) 67 | d_x = partial_size 68 | 69 | rms_x = norm_x * d_x ** (-1.0 / 2) 70 | x_normed = x / (rms_x + self.eps) 71 | 72 | if self.bias: 73 | return self.scale * x_normed + self.offset 74 | 75 | return self.scale * x_normed 76 | 77 | 78 | class ScaleNorm(torch.nn.Module): 79 | def __init__(self, dim, eps=1e-5): 80 | super().__init__() 81 | self.g = torch.nn.Parameter(torch.ones(1)) 82 | self.eps = eps 83 | 84 | def forward(self, x): 85 | n = torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps) 86 | return x / n * self.g 87 | -------------------------------------------------------------------------------- /configs/gen_docs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | from megatron.neox_arguments import neox_args, deepspeed_args 8 | from inspect import getmembers, getsource 9 | from dataclasses import field, is_dataclass 10 | from itertools import tee, zip_longest 11 | import pathlib 12 | 13 | 14 | def pairwise(iterable): 15 | "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
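# NOTE (added comment): tee() creates two independent iterators over the same
# sequence and zip_longest() pads the shorter one with None, so e.g.
# pairwise([1, 2, 3]) yields (1, 2), (2, 3), (3, None); the trailing
# (last, None) pair is how get_docs() below detects the final dataclass field.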
16 | a, b = tee(iterable) 17 | next(b, None) 18 | return zip_longest(a, b) 19 | 20 | 21 | def get_docs(module): 22 | ARGS_CLASSES = getmembers(module, is_dataclass) 23 | results = {} 24 | for name, dcls in ARGS_CLASSES: 25 | assert is_dataclass(dcls) 26 | src = getsource(dcls) 27 | d = dcls() 28 | loc = 0 29 | results[name] = {"doc": d.__doc__.strip(), "attributes": {}} 30 | for cur, _next in pairwise(d.__dataclass_fields__.items()): 31 | field_name, field_def = cur 32 | field_type = field_def.type 33 | if hasattr(field_type, "__name__"): 34 | field_type = field_type.__name__ 35 | else: 36 | field_type = str(field_type) 37 | 38 | field_default = field_def.default 39 | 40 | # try to find the field definition 41 | loc = src.find(f" {field_name}:", loc + len(field_name) + 1) 42 | 43 | if _next is not None: 44 | next_field_name, _ = _next 45 | # try to find the next field definition 46 | next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) 47 | else: 48 | next_loc = len(src) 49 | 50 | # try to get the docstring 51 | _src = src[loc:next_loc].strip() 52 | if '"""' in _src: 53 | doc = _src.split('"""')[1].strip() 54 | elif "'''" in _src: 55 | doc = _src.split("'''")[1].strip() 56 | else: 57 | doc = "" 58 | results[name]["attributes"][field_name] = { 59 | "name": field_name, 60 | "type": field_type, 61 | "default": field_default, 62 | "doc": doc, 63 | } 64 | return results 65 | 66 | 67 | def to_md(docs, intro_str=""): 68 | """ 69 | Writes the docs dictionary to markdown format 70 | """ 71 | lines = [] 72 | lines.append(intro_str) 73 | for name, doc in docs.items(): 74 | lines.append(f"## {name}") 75 | lines.append(f"{doc['doc']}") 76 | lines.append("") 77 | for field_name, field_def in doc["attributes"].items(): 78 | # attribute name and type 79 | lines.append(f"- **{field_name}**: {field_def['type']}") 80 | # default value 81 | lines.append(f" Default = {str(field_def['default'])}") 82 | lines.append(f" {field_def['doc']}") 83 | lines.append("") 84 | return "\n\n".join(lines) 85 | 86 | 87 | if __name__ == "__main__": 88 | docs = get_docs(neox_args) 89 | docs.update(get_docs(deepspeed_args)) 90 | intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n""" 91 | md = to_md(docs, intro_str=intro_str) 92 | with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: 93 | f.write(md) 94 | --------------------------------------------------------------------------------
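To tie the configuration fragments above together: per the module docstring in megatron/neox_arguments/__init__.py, NeoXArgs.from_ymls() merges a list of yaml files into a single validated argument set. A usage sketch follows; it assumes the repo root as working directory, installed dependencies, and that this particular (illustrative) file combination passes validation:

from megatron.neox_arguments import NeoXArgs

neox_args = NeoXArgs.from_ymls([
    "alon_configs/models/160m.yml",       # model / optimizer / train settings
    "alon_configs/gpu/gpu0.yml",          # device placement for a single local GPU
    "alon_configs/data/openwebtext.yml",  # dataset paths
])
print(neox_args.train_batch_size)  # 480, per 160m.yml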