├── LICENSE ├── README.md ├── compiler_fx ├── LICENSE ├── README.md ├── deep_gpt2-xl.py ├── deep_llama-8b.py ├── deepspeed_gpt2.py ├── deepspeed_pp_training.py ├── example_train_gpu.py ├── fx_dist_inference_type-A.py ├── fx_dist_inference_type-B.py ├── fx_dist_pp_dp_training_type-A_gpt2_gpu.py ├── fx_dist_pp_dp_training_type-A_gpu.py ├── fx_dist_pp_dp_training_type-A_gpu_validate.py ├── fx_dist_pp_training_type-A.py ├── fx_dist_pp_training_type-A_bert.py ├── fx_dist_pp_training_type-A_gpt2-medium.py ├── fx_dist_pp_training_type-A_gpt2.py ├── fx_dist_pp_training_type-A_gpt2_gpu.py ├── fx_dist_pp_training_type-A_gpt2_gpu_mopt1.py ├── fx_dist_pp_training_type-A_gpu.py ├── fx_dist_pp_training_type-A_gpu_measure-flops.py ├── fx_dist_pp_training_type-A_gpu_mopt1.py ├── fx_dist_pp_training_type-A_mopt1.py ├── fx_dist_pp_training_type-A_remove-fwdcache.py ├── fx_dist_pp_training_type-A_transformer.py ├── fx_dist_pp_training_type-B.py ├── fx_dist_pp_training_type-B_transformer.py ├── fx_dist_pp_training_type-C_bert_gpu.py ├── fx_dist_pp_training_type-C_gpt-neo_gpu.py ├── fx_dist_pp_training_type-C_gpt2-large_gpu.py ├── fx_dist_pp_training_type-C_gpt2-medium_gpu.py ├── fx_dist_pp_training_type-C_gpt2-xl_gpu.py ├── fx_dist_pp_training_type-C_gpt2_gpu.py ├── fx_dist_pp_training_type-C_gptj_gpu.py ├── fx_dist_pp_training_type-C_gpu.py ├── fx_dist_training_type-B.py ├── fx_dist_training_type-B_many-process.py ├── fx_inference.py ├── fx_inference_restructured.py ├── fx_ir_transfer.py ├── fx_split_graph.py ├── fx_split_range_traversal.py ├── fx_train.py ├── fx_train_extended.py ├── fx_train_with_backward_IR.py ├── fx_train_with_forwardonly_IR.py ├── fx_transformer.py ├── llama_2d.py ├── memory_usage.py ├── memory_usage2.py ├── memory_usage3.py ├── name_map_test.py ├── pippy-2_pp_training.py ├── pippy_pp_dp_training.py ├── pippy_pp_dp_training_gpt2.py ├── pippy_pp_training.py ├── pippy_pp_training_gpt2-large.py ├── pippy_pp_training_gpt2-medium.py ├── pippy_pp_training_gpt2-xl.py ├── pippy_pp_training_gpt2.py ├── py_comp.py ├── tf_comp.py ├── util_find_linear.py ├── util_setup_mesh.py ├── varuna_pp_dp_training.py ├── varuna_pp_training.py └── vit_train.py ├── gpt-neox ├── .clang-format ├── .dockerignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README-MUP.md ├── README.md ├── configs │ ├── 1.3B-32k-len-conf.yml │ ├── 125M-32k-len-conf.yml │ ├── 125M.yml │ ├── 2-7B.yml │ ├── 2.7B-32k-len-conf.yml │ ├── 20B.yml │ ├── 250M-32k-len-conf.yml │ ├── 6-7B.yml │ ├── 6.7B-32k-len-conf.yml │ ├── 6.7B-32k-len-conf.yml_org │ ├── 760M-32k-len-conf.yml │ ├── README.md │ ├── autotuning_configs │ │ ├── small_tune.json │ │ ├── tune.json │ │ ├── tune_1-3B.json │ │ └── tune_6-7B.json │ ├── enwik8.yml │ ├── etri_cluster.yml │ ├── finetuning_configs │ │ └── 6-9B.yml │ ├── gen_docs.py │ ├── llama │ │ ├── 13B.yml │ │ ├── 30B.yml │ │ ├── 65B.yml │ │ ├── 7B.yml │ │ ├── README.md │ │ └── train_config.yml │ ├── neox_arguments.md │ ├── org │ │ ├── 1-3B.yml │ │ ├── 125M-json.yml │ │ ├── 13B.yml │ │ ├── 175B.yml │ │ ├── 19M.yml │ │ ├── 250M.yml │ │ ├── 350M.yml │ │ ├── 49M.yml │ │ ├── 760M.yml │ │ ├── 800M.yml │ │ ├── bf16_125M.yml │ │ ├── bnb_125M.yml │ │ ├── cpu_mock_config.yml │ │ ├── eleutherai_cluster.yml │ │ ├── gmlp_small.yml │ │ ├── local_setup.yml │ │ ├── slurm_125M.yml │ │ ├── slurm_local.yml │ │ ├── sparse.yml │ │ ├── test.yml │ │ └── text_generation.yml │ ├── pile.yml │ ├── pythia │ │ ├── 1-4B.yml │ │ ├── 12B.yml │ │ ├── 160M.yml │ │ ├── 1B.yml │ │ ├── 2-8B.yml │ │ ├── 
410M.yml │ │ ├── 6-9B.yml │ │ └── 70M.yml │ └── slurm_local.json ├── deepy.py ├── eval_tasks │ ├── __init__.py │ └── eval_adapter.py ├── evaluate.py ├── generate.py ├── hostfile ├── megatron │ ├── __init__.py │ ├── checkpointing.py │ ├── data │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── blendable_dataset.py │ │ ├── data_utils.py │ │ ├── gpt2_dataset.py │ │ ├── helpers.cpp │ │ ├── helpers.cpython-38-x86_64-linux-gnu.so │ │ ├── indexed_dataset.py │ │ └── samplers.py │ ├── fused_kernels │ │ ├── __init__.py │ │ ├── compat.h │ │ ├── scaled_masked_softmax.cpp │ │ ├── scaled_masked_softmax.h │ │ ├── scaled_masked_softmax_cuda.cu │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ ├── scaled_upper_triang_masked_softmax.h │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ ├── setup.py │ │ ├── tests │ │ │ └── test_fused_kernels.py │ │ └── type_shim.h │ ├── gradient_noise_scale │ │ ├── __init__.py │ │ └── gradient_noise_scale.py │ ├── initialize.py │ ├── learning_rates.py │ ├── logging.py │ ├── model │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── flash_attention.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_softmax.py │ │ ├── gmlp.py │ │ ├── gpt2_model.py │ │ ├── init_functions.py │ │ ├── norms.py │ │ ├── positional_embeddings.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── word_embeddings.py │ ├── mpu │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── initialize.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── mup_substitute.py │ ├── neox_arguments │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── deepspeed_args.py │ │ ├── neox_args.py │ │ └── template.py │ ├── optimizers.py │ ├── text_generation_utils.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── gpt2_tokenization.py │ │ ├── tokenizer.py │ │ └── train_tokenizer.py │ ├── training.py │ └── utils.py ├── prepare_data.py ├── requirements │ ├── requirements-dev.txt │ ├── requirements-flashattention.txt │ ├── requirements-onebitadam.txt │ ├── requirements-s3.txt │ ├── requirements-sparseattention.txt │ ├── requirements-tensorboard.txt │ ├── requirements-wandb.txt │ └── requirements.txt ├── scripts_swsok │ ├── 0.remove_nvidia_driver_and_cuda.sh │ ├── 1.cuda_11_7_install.sh │ ├── 10.run_and_collect_logs_single.sh │ ├── 11.cat_csv_from_log.sh │ ├── 12.run_and_collect_logs_multi.sh │ ├── 13.run_20B_and_collect_logs_multi.sh │ ├── 14.long_seqlen_6.7B.sh │ ├── 15.long_seqlen_1.3B.sh │ ├── 16.zero_opt_stages_1.3B.sh │ ├── 17.760M_zero_stages.sh │ ├── 2.docker_and_nvidia_container_toolkit_install.sh │ ├── 3.required_packages_install.sh │ ├── 4.requirements_install.sh │ ├── 5.prepare_dataset.sh │ ├── 6.pretrain_125M_local.sh │ ├── 7.patch_best_download.sh │ ├── 8.print_loss_progress.sh │ ├── 9.run_docker.sh │ └── run_sshd.sh ├── seccomp-docker.json ├── tests │ ├── README.md │ ├── __init__.py │ ├── common.py │ ├── model │ │ ├── __init__.py │ │ ├── test_fused_kernels.py │ │ ├── test_model_checkpoint.py │ │ ├── test_model_generation.py │ │ ├── test_model_instantiation.py │ │ └── test_model_train.py │ ├── neox_args │ │ ├── __init__.py │ │ ├── test_neoxargs_commandline.py │ │ ├── test_neoxargs_implementation.py │ │ ├── test_neoxargs_load.py │ │ └── test_neoxargs_usage.py │ ├── pytest.ini │ └── test_configs │ │ └── test_train_base.yml ├── tools │ ├── README.md │ ├── __init__.py │ ├── bash │ │ ├── README.md │ │ ├── kill.sh │ │ ├── killall.sh │ │ ├── sync.sh │ │ ├── sync_cmd.sh │ │ └── syncdir.sh │ ├── ckpts │ │ ├── README.md │ │ ├── convert_hf_to_sequential.py │ │ ├── convert_module_to_hf.py │ │ ├── 
convert_raw_llama_weights_to_neox.py │ │ ├── convert_sequential_to_hf.py │ │ ├── inspect_checkpoints.py │ │ ├── merge20b.py │ │ └── upload.py │ ├── convert_hf_to_sequential.py │ ├── convert_module_to_hf.py │ ├── convert_raw_llama_weights_to_neox.py │ ├── convert_sequential_to_hf.py │ ├── corpora.py │ ├── datasets │ │ ├── README.md │ │ ├── corpora.py │ │ ├── merge_datasets.py │ │ ├── multinode_prepare_data.sh │ │ ├── preprocess_data.py │ │ └── preprocess_data_with_mask.py │ ├── inspect_checkpoints.py │ ├── kill.sh │ ├── killall.sh │ ├── merge20b.py │ ├── merge_datasets.py │ ├── merge_mp_partitions.py │ ├── multinode_prepare_data.sh │ ├── preprocess_data.py │ ├── preprocess_data_with_mask.py │ ├── sync.sh │ ├── sync_cmd.sh │ ├── syncdir.sh │ └── upload.py └── train.py ├── k8s_kubeflow_install ├── README.md ├── common │ ├── 00-prepare-nodes.sh │ ├── 01-install-cudnn-and-nvidia-driver.sh │ ├── 02-install-docker.sh │ ├── 03-install-nvidia-docker.sh │ ├── 04-install-k8s.sh │ ├── 05-init-k8s-master-only.sh │ ├── 06-install-kubeflow-master-only.sh │ ├── 07-certificate-kubeflow-master-only.sh │ ├── 08-port-forward-kubeflow-master-only.sh │ ├── 09-print-join-cmd.sh │ ├── 10-enable-k8s-dashboard-master-only.sh │ ├── 11-reset-k8s.sh │ ├── 12-add-kubeflow-user.sh │ ├── 13-port-forward-k8s-container.sh │ ├── 14-remove-a-node.sh │ ├── certificate.yaml │ ├── cluster-role-binding.yaml │ ├── dashboard-adminuser.yaml │ ├── gateway.yaml │ ├── profile.yaml │ ├── profile1.yaml │ ├── profile2.yaml │ ├── profile3.yaml │ ├── profile4.yaml │ ├── profile5.yaml │ ├── profile6.yaml │ ├── profile7.yaml │ ├── profile8.yaml │ └── profile9.yaml ├── docker │ ├── Dockerfile │ ├── Dockerfile.org │ ├── Dockerfile.scratch │ ├── cuda-requirements.txt │ ├── make_dockerimage.sh │ ├── requirements.txt │ └── s6 │ │ ├── cont-init.d │ │ └── 01-copy-tmp-home │ │ └── services.d │ │ └── jupyterlab │ │ └── run ├── setup_for_gpu_node_master.sh └── setup_for_gpu_node_worker.sh ├── llama3_inference ├── README.md ├── llama3_inference_basic.py └── llama3_inference_memory_offload.py ├── mlperf ├── README.md └── pytorch-22.09 │ ├── Dockerfile │ ├── LICENSE │ ├── NOTICE │ ├── README.md │ ├── README_2xa30_ngc22.09_pytorch.md │ ├── README_dgxa100_n512_ngc22.09_pytorch.md │ ├── README_dgxa100_n8_ngc22.09_pytorch.md │ ├── README_dgxa100_ngc22.09_pytorch.md │ ├── a30-run_and_time.sh │ ├── a30.sub │ ├── bmm1.py │ ├── bmm2.py │ ├── cleanup_scripts │ ├── chop_hdf5_files.py │ ├── clean.sh │ ├── cleanup_file.py │ ├── create_pretraining_data_wrapper.sh │ ├── create_wiki_test_set_md5_hashes.py │ ├── dataset_stats.py │ ├── do_gather.py │ ├── do_sentence_segmentation.py │ ├── extract_test_set_articles.py │ ├── parallel_create_hdf5.sh │ ├── process_wiki.sh │ ├── reshard_hdf5_files.py │ ├── transparency_in_test_set_generation.py │ └── wiki_test_set_md5.txt │ ├── config_A30_1x2x224x14.sh │ ├── config_A40_1x2x224x14.sh │ ├── config_DGXA100_1x4x56x2.sh │ ├── config_DGXA100_1x8x56x1.sh │ ├── config_DGXA100_4gpu_common.sh │ ├── config_DGXA100_512x8x2x1_pack.sh │ ├── config_DGXA100_8x8x48x1.sh │ ├── config_DGXA100_common.sh │ ├── convert_tf_checkpoint.py │ ├── extract_features.py │ ├── file_utils.py │ ├── fmha.py │ ├── function.py │ ├── fwd_loss_bwd_trainer.py │ ├── inference.py │ ├── input_preprocessing │ ├── 2048_shards_varlength.chk │ ├── 4320_shards_varlength.chk │ ├── chop_hdf5_files.py │ ├── chop_hdf5_files_to_varlength.py │ ├── clean.sh │ ├── cleanup_file.py │ ├── convert_fixed2variable.py │ ├── create_pretraining_data.py │ ├── 
create_pretraining_data_wrapper.sh │ ├── do_gather.py │ ├── do_sentence_segmentation.py │ ├── eval.md5 │ ├── eval_varlength.chk │ ├── hdf5_md5.py │ ├── packed_data │ │ ├── README.md │ │ ├── create_packed_trainset.py │ │ ├── create_per_seqlength_data.py │ │ ├── gather_per_seqlength_data.py │ │ ├── generate_packing_strategy.py │ │ └── prepare_packed_data.sh │ ├── parallel_create_hdf5.sh │ ├── pick_eval_samples.py │ ├── pick_eval_samples_varlength.py │ ├── prepare_data.sh │ ├── process_wiki.sh │ ├── seperate_test_set.py │ ├── shuffle_samples.py │ ├── shuffle_samples_write.py │ ├── shuffle_samples_write.py_bak │ └── tokenization.py │ ├── mha.py │ ├── mhalib │ ├── mha_funcs.cu │ └── setup.py │ ├── mlperf_logger.py │ ├── model │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── attention.py │ │ ├── embeddings.py │ │ ├── fused.py │ │ └── layernorm.py │ ├── losses │ │ └── __init__.py │ └── models │ │ └── __init__.py │ ├── modeling.py │ ├── mounts.txt │ ├── optim │ └── distributed_fused_lamb.py │ ├── optimization.py │ ├── padding.py │ ├── requirements.txt │ ├── run.sub │ ├── run_and_time.sh │ ├── run_pretraining.py │ ├── run_squad.py │ ├── run_test.sh │ ├── run_with_docker.sh │ ├── scaleoutbridge.py │ ├── schedulers.py │ ├── scripts │ └── run_pretraining.sh │ ├── softmax.py │ ├── tokenization.py │ ├── unit_test │ ├── __init__.py │ ├── global_vars.py │ ├── test_bert_batch_1.py │ ├── test_bert_batch_7.py │ ├── test_data_path.py │ ├── test_embeddings_batch_1.py │ ├── test_encoders_batch_1.py │ ├── test_main.py │ └── unit_test_utils.py │ └── utils.py ├── opt_prime ├── README.md ├── demo │ ├── README.md │ ├── pp_train_gpt2.py │ ├── pp_train_llama-13b.py │ └── pp_train_llama-8b.py ├── examples │ ├── pp_train_bert.py │ ├── pp_train_electra.py │ ├── pp_train_gpt-neo.py │ ├── pp_train_gpt2-large.py │ ├── pp_train_gpt2-medium.py │ ├── pp_train_gpt2-xl-flops.py │ ├── pp_train_gpt2-xl.py │ ├── pp_train_gpt2.py │ ├── pp_train_gpt2_autocast.py │ ├── pp_train_gpt2_seq-cls.py │ ├── pp_train_gptj.py │ ├── pp_train_gptj2.py │ ├── pp_train_llama-small.py │ ├── pp_train_llama.py │ ├── pp_train_llama2.py │ ├── pp_train_llama3.py │ ├── pp_train_llama4.py │ ├── pp_train_llama5.py │ ├── pp_train_llama6.py │ ├── pp_train_llama7.py │ ├── pp_train_llama_autocast.py │ ├── pp_train_opt.py │ ├── pp_train_opt2.py │ ├── pp_train_synthetic.py │ ├── pp_train_synthetic2.py │ ├── pp_train_vit.py │ └── pp_train_whisper.py └── opt_prime │ ├── IR.py │ ├── __init__.py │ ├── comm.py │ ├── opti_pri.py │ └── schedule.py └── torchgpipe_OOO_PP ├── LICENSE ├── README.md ├── gpipe_opt_synthetic3.py ├── gpipe_opt_synthetic3_gpu.py ├── gpipe_opt_synthetic4_gpu.py ├── gpipe_opt_synthetic5_gpu.py ├── gpipe_opt_transformer_gpu.py ├── gpipe_static_opt_synthetic2.py ├── gpipe_static_opt_synthetic2_gpu.py ├── gpipe_synthetic1.py └── gpipe_transformer_gpu.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022-present, ETRI, All rights reserved. 4 | 5 | From PyTorch: 6 | Copyright (c) 2014- Facebook, Inc, All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | 2. 
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# High-efficiency AI computing SW core technology project (AIcomp)

This project develops low-cost, high-efficiency AI computing platform technology to overcome two inefficiencies of large-model training: the excessive computing resources it consumes and its dependency on specific high-cost hyperclusters.

We are developing parallelization framework software called OptimusPrime, which currently provides two-dimensional parallelization (pipeline parallelism combined with data parallelism) and memory-efficient optimization features ( [opt_prime folder](./opt_prime) ).

Additionally, we have developed multiple PoCs along the way. In the early stages of this project, we presented training PoCs that integrated Out-Of-Order technology (https://dl.acm.org/doi/pdf/10.1145/3492321.3519563) on top of torchgpipe ( [torchgpipe_OOO_PP folder](./torchgpipe_OOO_PP) ). In the next stage, we developed multiple PoCs that extract an IR from the model and perform distributed training by partitioning it across multiple GPUs ( [compiler_fx folder](./compiler_fx) ).

In addition, we aim to apply compiler-based 3D parallelism to models. Related PoCs are being developed ahead of time, and the ETRI framework SW will be developed in earnest in the second half of the year.
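To give a flavor of the compiler-based approach, the sketch below extracts an FX IR from a toy model and partitions it into pipeline stages. It is a minimal illustration built on stock torch.fx utilities, not the code of the compiler_fx PoCs themselves:

```python
# Minimal sketch of the compiler_fx idea using stock torch.fx utilities
# (illustrative only; the actual PoCs live in the compiler_fx folder).
import torch
import torch.nn as nn
from torch.fx import symbolic_trace
from torch.fx.passes.split_module import split_module

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))
gm = symbolic_trace(model)                 # extract the FX IR (a GraphModule)

mods = [n.name for n in gm.graph.nodes if n.op == "call_module"]

def stage_of(node):                        # naive 2-way split by position
    if node.op != "call_module":
        return 0
    return 0 if mods.index(node.name) < len(mods) // 2 else 1

stages = split_module(gm, model, stage_of) # yields submod_0, submod_1, ...
x = torch.randn(4, 16)
assert torch.allclose(stages(x), model(x)) # the split preserves semantics
```

In the distributed PoCs, each resulting submodule can then be placed on its own GPU/rank and driven by a pipeline schedule.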
## Features

An open-source AI training framework that provides automatic parallelization without model modifications ( [opt_prime](./opt_prime) )

* Enables parallelization of general models by removing constraints on model representation (compatible with Hugging Face models and PyTorch nn.Module)
* Automatic parallelization (model splitting) without user intervention
* Distributed parallel runtime supporting intra-host and inter-host execution concurrently (currently supports PP + DP)
* An IR-based system aiming for flexible optimization at a global level
* Memory optimization technology for avoiding CPU/GPU out-of-memory (OOM) failures


## License

The results of the AIcomp project are distributed under the 3-clause BSD license.
--------------------------------------------------------------------------------
/compiler_fx/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2022-present, ETRI, All rights reserved.

From PyTorch:
Copyright (c) 2014- Facebook, Inc, All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/compiler_fx/README.md:
--------------------------------------------------------------------------------
## 3D parallelism by compiler-based FX IR

Current: FX IR-based Pipeline Parallelism PoC, FX IR manipulation PoC, and so on

Future work: integration into PyTorch 2.0's compiler mechanism

## License

The results of the AIcomp project are distributed under the 3-clause BSD license.
--------------------------------------------------------------------------------
/compiler_fx/memory_usage2.py:
--------------------------------------------------------------------------------
import torch
from torch.nn.parameter import Parameter
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch.nn import init
import math

import psutil
import os
pid = os.getpid()
print(f">> Process ID: {pid}")

#use_reset_parameters = False
use_reset_parameters = True

print_flag = True

def print_memory_usage(label, print_flag):
    if print_flag:
        print(" =========", label, "=========")
        my_process = psutil.Process(pid)
        usage = my_process.memory_info().rss / (1024 ** 3)  # GB unit
        print(f" Memory Usage: {usage:.3f} GB")


# Re-implementation of nn.Linear that can optionally skip reset_parameters(),
# to compare memory usage with and without parameter initialization.
class Linear2(nn.Module):
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_features: int):
        super(Linear2, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight = Parameter(torch.empty((out_features, in_features)))
        self.bias = Parameter(torch.empty(out_features))

        if use_reset_parameters:
            self.reset_parameters()

    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

    def reset_parameters(self) -> None:
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(self.bias, -bound, bound)

print_memory_usage("Before: m = Linear2(10000, 20000)", print_flag)

m = Linear2(10000, 20000)

print(f" ***** use reset_parameters() : {use_reset_parameters} *****")
print_memory_usage("After: m = Linear2(10000, 20000)", print_flag)
print(f"{m.weight}")
input = torch.randn(20000, 10000)
print_memory_usage("After: input = torch.randn(20000, 10000)", print_flag)
output = m(input)
print_memory_usage("After: output = m(input)", print_flag)
print(f"{output.size()}")
--------------------------------------------------------------------------------
/compiler_fx/util_find_linear.py:
--------------------------------------------------------------------------------
import torch
from transformers import AutoModel
import sys
import os

def find_linear_modules(model):
    linear_modules = []

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            linear_modules.append((name, module))

    return linear_modules

def find_linear_modules2(model):
    linear_modules = []

    def recursive_search(prefix, module):
        for name, sub_module in module.named_children():
            full_name = f"{prefix}.{name}" if prefix else name
            if isinstance(sub_module, torch.nn.Linear):
            #if isinstance(sub_module, torch.nn.Linear) or isinstance(sub_module, torch.nn.Conv1d) or hasattr(sub_module, "weight"):
                linear_modules.append((full_name, sub_module))
            else:
                recursive_search(full_name, sub_module)

    recursive_search("", model)

    return linear_modules

if __name__ == "__main__":
    #print(f"len(sys.argv) --> {len(sys.argv)}")
    #print(f"sys.argv[0] --> {sys.argv[0]}")


    #model_name = "bert-base-uncased"
    #model_name = "openai/whisper-base"
    model_name = "facebook/opt-350m"

    model = AutoModel.from_pretrained(model_name)

    #print(f"model: {model}")

    linear_layers = find_linear_modules(model)
    #linear_layers = find_linear_modules2(model)

    if linear_layers:
        for name, module in linear_layers:
            print(f"- {name}: {module}")
        print(f">> found: {len(linear_layers)} nn.Linear")
    else:
        print("No nn.Linear found")
--------------------------------------------------------------------------------
/compiler_fx/util_setup_mesh.py:
--------------------------------------------------------------------------------
import os
import sys
import time
import math

import torch
import torch.distributed as dist

from torch.distributed.device_mesh import init_device_mesh


rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
master_addr = os.getenv("MASTER_ADDR")
master_port = os.getenv("MASTER_PORT")

#
# world size 8
#
pp_size = 2
tp_size = 2
dp_size = 2

#
# world size 16
#
#pp_size = 8
#tp_size = 2
#dp_size = 1

#
# world size 12
#
#pp_size = 3
#dp_size = 2
#tp_size = 2


assert world_size == pp_size * dp_size * tp_size, f"pp_size({pp_size}) * dp_size({dp_size}) * tp_size({tp_size}) must be equal to world_size({world_size})"
assert world_size % tp_size == 0, f"world size({world_size}) must be divisible by tp size({tp_size})"
assert world_size % dp_size == 0, f"world size({world_size}) must be divisible by dp size({dp_size})"


dist.init_process_group("nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(local_rank)

device = torch.device(f"cuda:{local_rank}")

device_mesh = init_device_mesh("cuda", mesh_shape=(pp_size, dp_size, tp_size), mesh_dim_names=("pp", "dp", "tp"))
tp_group = device_mesh["tp"].get_group()
dp_group = device_mesh["dp"].get_group()
pp_group = device_mesh["pp"].get_group()
tp_mesh = device_mesh["tp"]
dp_mesh = device_mesh["dp"]
pp_mesh = device_mesh["pp"]


print(f"[{rank}] >>> pp group:{pp_mesh}, dp_group:{dp_mesh}, tp_group:{tp_mesh}")

time.sleep(2)

print(f"[rank:{rank}] run completed ...")
--------------------------------------------------------------------------------
/gpt-neox/.dockerignore:
--------------------------------------------------------------------------------
20B_checkpoints/
--------------------------------------------------------------------------------
/gpt-neox/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: check-case-conflict
      - id: check-json
      - id: check-symlinks
      - id: check-yaml
      - id: destroyed-symlinks
      - id: end-of-file-fixer
        exclude: docs/CNAME
      - id: fix-byte-order-marker
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
      - id: requirements-txt-fixer
      - id: trailing-whitespace
  - repo: https://gitlab.com/daverona/pre-commit-cpp
    rev: 0.8.0
    hooks:
      - id: clang-format # formatter of C/C++
code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 23 | args: [] 24 | 25 | - repo: https://github.com/psf/black 26 | rev: 22.3.0 27 | hooks: 28 | - id: black 29 | language_version: python3 30 | - repo: https://github.com/codespell-project/codespell 31 | rev: v2.1.0 32 | hooks: 33 | - id: codespell 34 | args: [ 35 | '--ignore-words-list=reord,dout', # Word used in error messages that need rewording 36 | --check-filenames, 37 | --check-hidden, 38 | ] 39 | -------------------------------------------------------------------------------- /gpt-neox/CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | authors: 4 | - affiliation: EleutherAI 5 | family-names: Andonian 6 | given-names: Alex 7 | - affiliation: EleutherAI 8 | family-names: Anthony 9 | given-names: Quentin 10 | - affiliation: EleutherAI 11 | family-names: Biderman 12 | given-names: Stella 13 | - affiliation: EleutherAI 14 | family-names: Black 15 | given-names: Sid 16 | - affiliation: EleutherAI 17 | family-names: Gali 18 | given-names: Preetham 19 | - affiliation: EleutherAI 20 | family-names: Gao 21 | given-names: Leo 22 | - affiliation: EleutherAI 23 | family-names: Hallahan 24 | given-names: Eric 25 | - affiliation: EleutherAI 26 | family-names: Levy-Kramer 27 | given-names: Josh 28 | - affiliation: EleutherAI 29 | family-names: Leahy 30 | given-names: Connor 31 | - affiliation: EleutherAI 32 | family-names: Nestler 33 | given-names: Lucas 34 | - affiliation: EleutherAI 35 | family-names: Parker 36 | given-names: Kip 37 | - affiliation: EleutherAI 38 | family-names: Pieler 39 | given-names: Michael 40 | - affiliation: EleutherAI 41 | family-names: Phang 42 | given-names: Jason 43 | - affiliation: EleutherAI 44 | family-names: Purohit 45 | given-names: Shivanshu 46 | - affiliation: EleutherAI 47 | family-names: Schoelkopf 48 | given-names: Hailey 49 | - affiliation: EleutherAI 50 | family-names: Stander 51 | given-names: Dashiell 52 | - affiliation: EleutherAI 53 | family-names: Songz 54 | given-names: Tri 55 | - affiliation: EleutherAI 56 | family-names: Tigges 57 | given-names: Curt 58 | - affiliation: EleutherAI 59 | family-names: Thérien 60 | given-names: Benjamin 61 | - affiliation: EleutherAI 62 | family-names: Wang 63 | given-names: Phil 64 | - affiliation: EleutherAI 65 | family-names: Weinbach 66 | given-names: Samuel 67 | cff-version: "1.1.0" 68 | keywords: 69 | - "Transformers" 70 | - "Massive language model" 71 | - "Autoregressive language model" 72 | license: "Apache-2.0" 73 | message: "If you use this software, please cite it using these metadata." 74 | repository-code: "https://www.github.com/eleutherai/gpt-neox" 75 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch" 76 | version: "2.0.0" 77 | doi: "10.5281/zenodo.5879544" 78 | date-released: 2021-08-23 79 | ... 
--------------------------------------------------------------------------------
/gpt-neox/MANIFEST.in:
--------------------------------------------------------------------------------
include megatron/data/Makefile
include megatron/data/helpers.cpp
--------------------------------------------------------------------------------
/gpt-neox/README-MUP.md:
--------------------------------------------------------------------------------
# How to use Mup (https://github.com/microsoft/mup)

## Add mup neox args to your config

```
# mup

"use-mup": true,

"save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank

"base-shapes-file": "base-shapes", # load base shapes from this file

"coord-check": false, # generate coord check plots to verify mup's implementation in neox

# mup hp search

"mup-init-scale": 1.0,

"mup-attn-temp": 1.0,

"mup-output-temp": 1.0,

"mup-embedding-mult": 1.0,

"mup-rp-embedding-mult": 1.0,
```

## Generate base shapes

1. Set use-mup to true
2. Set save-base-shapes to true
3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank, named `<base-shapes-file>.<rank>`. gpt-neox will exit immediately.
4. Set save-base-shapes to false

## Generate coord check plots (optional)

1. Keep use-mup true
2. Set coord-check to true
3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately.
4. Set coord-check to false

## Tune mup hyperparameters and LR

The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These and the LR are tuned with a random search using the scaled-up config (tested with 6-7B.yml), but with hidden-size set to the value from the scaled-down config (125M.yml).

## Transfer

With the best LR and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again.
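For reference, the steps above drive roughly the following mup API under the hood. This is a minimal standalone sketch against the microsoft/mup package (not gpt-neox code); `MLP` and the width values are illustrative placeholders, and a faithful model would also use `mup.MuReadout` for its output layer:

```python
# Standalone sketch of the mup workflow the steps above automate.
# MLP and the width values are placeholders, not gpt-neox code.
import torch.nn as nn
from mup import MuAdam, make_base_shapes, set_base_shapes

class MLP(nn.Module):
    def __init__(self, width):
        super().__init__()
        self.body = nn.Linear(width, width)
        self.head = nn.Linear(width, 10)  # a faithful setup would use mup.MuReadout here
    def forward(self, x):
        return self.head(self.body(x))

# "Generate base shapes": compare a base model against a delta model that
# differs only in the widths to be scaled, and save one shapes file.
make_base_shapes(MLP(width=256), MLP(width=512), savefile="base-shapes")

# Training time: register the base shapes on the scaled-up model and use a
# mup-aware optimizer so the tuned LR transfers across widths.
model = MLP(width=4096)
set_base_shapes(model, "base-shapes")
optimizer = MuAdam(model.parameters(), lr=1e-4)
```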
50 | -------------------------------------------------------------------------------- /gpt-neox/configs/1.3B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 64, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 1, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 20, 77 | "lr_decay_iters": 20, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/125M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | 
"device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/2.7B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 
| "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/250M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/6.7B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 4, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | "num_layers": 32, 7 | "hidden_size": 4096, 8 | "num_attention_heads": 32, 9 | "seq_length": 32768, 10 | "max_position_embeddings": 32768, 11 | "pos_emb": "rotary", 12 | "rotary_pct": 1, 13 | "rotary_emb_base": 10000, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | 18 | "attention_config": [[["flash"], 32]], 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | "optimizer": { 29 | "type": "adam", 30 | "params": { 31 | "lr": 0.0001, 32 | "betas": [0.9, 0.95], 33 | "eps": 1.0e-6, 34 | } 35 | }, 36 
| "min_lr": 0.00001, 37 | 38 | "zero_optimization": { 39 | "stage": 1, 40 | "offload_param": { 41 | "device": "cpu" 42 | }, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # "train_batch_size": 128, 52 | # "train_batch_size": 32, 53 | "gradient_accumulation_steps": 32, 54 | "split": "960,35,5", 55 | "train_micro_batch_size_per_gpu": 4, 56 | "data_impl": "mmap", 57 | 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | "train_iters": 200, 78 | "lr_decay_iters": 200, 79 | "distributed_backend": "nccl", 80 | "lr_decay_style": "cosine", 81 | "warmup": 0.05, 82 | "checkpoint_factor": 450, 83 | "eval_interval": 1800, 84 | "eval_iters": 10, 85 | 86 | "log_interval": 10, 87 | "steps_per_print": 1, 88 | "keep_last_n_checkpoints": 10, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /gpt-neox/configs/760M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1536, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 64, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 10, 77 | "lr_decay_iters": 10, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | 
"eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 5, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/small_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.0006, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "data-impl": "mmap", 29 | "split": "949,50,1", 30 | 31 | "checkpoint-activations": true, 32 | "checkpoint-num-layers": 1, 33 | "partition-activations": true, 34 | "synchronize-each-layer": true, 35 | 36 | "gradient_clipping": 1.0, 37 | "weight-decay": 0.0, 38 | "hidden-dropout": 0.0, 39 | "attention-dropout": 0.0, 40 | 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "loss_scale_window": 1000, 45 | "hysteresis": 2, 46 | "min_loss_scale": 1 47 | }, 48 | 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "save-interval": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | 58 | "log-interval": 100, 59 | "steps_per_print": 10, 60 | "keep-last-n-checkpoints": 4, 61 | "wall_clock_breakdown": true, 62 | "launcher": "slurm", 63 | "deepspeed_slurm": true, 64 | "comment": "neox", 65 | "autotuning": { 66 | "enabled": true, 67 | "arg_mappings": { 68 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 69 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 70 | } 71 | }, 72 | "zero_optimization": { 73 | "stage": [0, 1, 2, 3] 74 | }, 75 | "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 76 | "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 77 | "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true, 30 | "cpu_offload": false 31 | }, 32 | "train_micro_batch_size_per_gpu": 1, 33 | "autotuning_config": { 34 | "enabled": true, 35 | "arg_mappings": { 36 
| "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 37 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 38 | } 39 | }, 40 | "data-impl": "mmap", 41 | "split": "949,50,1", 42 | "checkpoint-activations": true, 43 | "checkpoint-num-layers": 1, 44 | "partition-activations": true, 45 | "synchronize-each-layer": true, 46 | "gradient_clipping": 1.0, 47 | "weight-decay": 0.0, 48 | "hidden-dropout": 0.0, 49 | "attention-dropout": 0.0, 50 | "fp16": { 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | "train-iters": 200, 58 | "lr-decay-iters": 320000, 59 | "distributed-backend": "nccl", 60 | "lr-decay-style": "cosine", 61 | "warmup": 0.01, 62 | "save-interval": 10000, 63 | "eval-interval": 1000, 64 | "eval-iters": 10, 65 | "log-interval": 100, 66 | "steps_per_print": 10, 67 | "keep-last-n-checkpoints": 4, 68 | "wall_clock_breakdown": true, 69 | "launcher": "slurm", 70 | "deepspeed_slurm": true, 71 | "comment": "neox" 72 | } 73 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/tune_1-3B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 24, 6 | "hidden-size": 2048, 7 | "num-attention-heads": 16, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | "attention_config": [[["flash"], 24]], 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0002, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00002, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | "train_micro_batch_size_per_gpu": 1, 42 | "autotuning": { 43 | "enabled": true, 44 | "arg_mappings": { 45 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 46 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 47 | } 48 | }, 49 | "data-impl": "mmap", 50 | 51 | "checkpoint-activations": false, 52 | "checkpoint-num-layers": 1, 53 | "partition-activations": true, 54 | "synchronize-each-layer": true, 55 | 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint-factor": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | "launcher": "slurm", 79 | "deepspeed_slurm": true, 80 | "no_ssh_check": true, 81 | 82 | "log-interval": 10, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 1, 85 | "wall_clock_breakdown": true 86 | } 87 | -------------------------------------------------------------------------------- 
/gpt-neox/configs/autotuning_configs/tune_6-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 8, 4 | 5 | "num-layers": 32, 6 | "hidden-size": 4096, 7 | "num-attention-heads": 32, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.00012, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "zero_optimization": { 29 | "stage": [0, 1, 2, 3] 30 | }, 31 | "data-impl": "mmap", 32 | "split": "949,50,1", 33 | 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | 39 | "gradient_clipping": 1.0, 40 | "weight-decay": 0, 41 | "hidden-dropout": 0, 42 | "attention-dropout": 0, 43 | 44 | "fp16": { 45 | "fp16": true, 46 | "enabled": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "hysteresis": 2, 50 | "min_loss_scale": 1 51 | }, 52 | 53 | "train-iters": 100, 54 | "lr-decay-iters": 320000, 55 | "distributed-backend": "nccl", 56 | "lr-decay-style": "cosine", 57 | "warmup": 0.01, 58 | "checkpoint-factor": 10000, 59 | "eval-interval": 1000, 60 | "eval-iters": 10, 61 | "log-interval": 100, 62 | "steps_per_print": 10, 63 | "keep-last-n-checkpoints": 4, 64 | "wall_clock_breakdown": true, 65 | "launcher": "slurm", 66 | "deepspeed_slurm": true, 67 | "no_ssh_check": true, 68 | "comment": "neox", 69 | "autotuning": { 70 | "enabled": true, 71 | "mp_size": 8, 72 | "arg_mappings": { 73 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 74 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /gpt-neox/configs/enwik8.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | #"data_path": "/var/nfs/data/enwik8/enwik8_text_document", 4 | #"data_path": "/data/enwik8/enwik8/enwik8_text_document", 5 | "data_path": "./dataset/enwik8/enwik8/enwik8_text_document", 6 | 7 | # or for weighted datasets: 8 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 10 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 11 | # "train-data-weights": [1., 2.], 12 | # "test-data-weights": [2., 1.], 13 | # "valid-data-weights": [0.5, 0.4], 14 | 15 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
  # WARNING: setting this to True will override any user provided weights
  # "weight_by_num_documents": false,
  # "weighted_sampler_alpha": 0.3,

  "vocab_file": "./dataset/enwik8/gpt2-vocab.json",
  "merge_file": "./dataset/enwik8/gpt2-merges.txt",

  # "save": "checkpoints",
  # "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": False,

  "tensorboard_dir": "tensorboard",
  "log_dir": "./logs",
  "use_wandb": True,
  "wandb_host": "https://api.wandb.ai",
  "wandb_project": "neox"

}
--------------------------------------------------------------------------------
/gpt-neox/configs/etri_cluster.yml:
--------------------------------------------------------------------------------
# Configurations for using the ETRI GPU cluster
{
  "launcher": "pdsh",
  #"launcher": "openmpi",
  #"deepspeed_mpi": true,
  #"gradient_accumulation_steps": 8,
  #"global_num_gpus": 16,
  "hostfile": "./hostfile",
}
--------------------------------------------------------------------------------
/gpt-neox/configs/finetuning_configs/6-9B.yml:
--------------------------------------------------------------------------------
{
  # finetuning option
  "load": "/path/to/checkpoint",
  "finetune": true,

  "pipe-parallel-size": 1,
  "model-parallel-size": 2,

  "num-layers": 32,
  "hidden-size": 4096,
  "num-attention-heads": 32,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary_pct": 0.25,
  "no-weight-tying": true,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",

  "attention-config": [[["flash"], 32]],

  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,


  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00012,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },

  "min_lr": 0.000012,

  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 1260000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1260000000,
    "contiguous_gradients": true,
    "cpu_offload": false,
    "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params
  },

  "train_micro_batch_size_per_gpu": 8,
  "gradient_accumulation_steps": 2,
  "data-impl": "mmap",

  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  "train-iters": 143000,
  "lr-decay-iters": 143000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 1000,
  "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
  "eval-interval": 143000,
  "eval-iters": 10,

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,

  "tokenizer_type": "HFTokenizer"
}
--------------------------------------------------------------------------------
/gpt-neox/configs/llama/13B.yml:
-------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/30B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 60, 8 | "hidden_size": 6656, 9 | "num_attention_heads": 52, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/65B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | 
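The four LLaMA fragments above fix only the architecture and parallel topology; optimizer, batch, and precision settings arrive from `train_config.yml` when the fragments are merged at launch (see the README that follows). A minimal sketch of that merge semantics, assuming the fragments are plain YAML readable by PyYAML and that conflicting duplicate keys should fail loudly; `merge_config_fragments` is an illustrative name, not a function in this repo:

```python
# Minimal sketch of GPT-NeoX style config merging: each .yml fragment is a
# flat YAML mapping, and the fragments passed on the command line are folded
# into one dict. Assumes PyYAML; the conflict handling is an assumption --
# the real NeoXArgs loader does its own duplicate-key validation.
import yaml

def merge_config_fragments(paths):
    merged = {}
    for path in paths:
        with open(path) as f:
            fragment = yaml.safe_load(f) or {}
        for key, value in fragment.items():
            if key in merged and merged[key] != value:
                # Two fragments disagree on the same setting: fail loudly.
                raise ValueError(f"{path}: conflicting value for {key!r}")
            merged[key] = value
    return merged

# e.g. merge_config_fragments(["configs/llama/7B.yml",
#                              "configs/llama/train_config.yml"])
```

Keeping model geometry and training hyperparameters in separate fragments is what lets the same `train_config.yml` drive all four model sizes.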
-------------------------------------------------------------------------------- /gpt-neox/configs/llama/README.md: -------------------------------------------------------------------------------- 1 | # LLaMA 2 | 3 | ## Training and Finetuning 4 | 5 | These configs contain the architecture settings required to run inference/training/finetuning on the [LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite. 6 | 7 | LLaMA finetuning can be launched with 8 | ```sh 9 | python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml 10 | ``` 11 | 12 | If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`. 13 | 14 | 15 | ## Inference 16 | 17 | 18 | LLaMA generation can be launched with 19 | ```sh 20 | python ./deepy.py ./generate.py -d configs \ 21 | llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \ 22 | -i input_prompt.txt -o prompt_out.txt 23 | ``` 24 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/train_config.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "finetune": true, 4 | 5 | # init methods 6 | "init_method": "small_init", 7 | "output_layer_init_method": "wang_init", 8 | 9 | # optimizer settings 10 | "optimizer": { 11 | "type": "Adam", 12 | "params": { 13 | "lr": 0.0002, 14 | "betas": [0.9, 0.95], 15 | "eps": 1.0e-8, 16 | } 17 | }, 18 | "min_lr": 0.00002, 19 | "override_lr_scheduler": true, 20 | 21 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 22 | "zero_optimization": { 23 | "stage": 1, 24 | "allgather_partitions": True, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": True, 27 | "reduce_scatter": True, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": True, 30 | }, 31 | 32 | # batch / data settings 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": true, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0, 46 | "attention_dropout": 0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "fp16": true, 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | 58 | # misc. 
training settings 59 | "train_iters": 320000, 60 | "lr_decay_iters": 320000, 61 | "distributed_backend": "nccl", 62 | "lr_decay_style": "cosine", 63 | "warmup": 0.01, 64 | "checkpoint_factor": 10000, 65 | "eval_interval": 1000, 66 | "eval_iters": 10, 67 | 68 | # logging 69 | "log_interval": 100, 70 | "steps_per_print": 10, 71 | "keep_last_n_checkpoints": 4, 72 | "wall_clock_breakdown": true, 73 | } 74 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00006, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | 42 | "train_micro_batch_size_per_gpu": 4, 43 | "data_impl": "mmap", 44 | 45 | "checkpoint_activations": true, 46 | "checkpoint_num_layers": 1, 47 | "partition_activations": true, 48 | "synchronize_each_layer": true, 49 | 50 | "gradient_clipping": 1.0, 51 | "weight_decay": 0.1, 52 | "hidden_dropout": 0.0, 53 | "attention_dropout": 0.0, 54 | 55 | "fp16": { 56 | "enabled": true, 57 | "loss_scale": 0, 58 | "loss_scale_window": 1000, 59 | "hysteresis": 2, 60 | "min_loss_scale": 1 61 | }, 62 | 63 | "train_iters": 320000, 64 | "lr_decay_iters": 320000, 65 | "distributed_backend": "nccl", 66 | "lr_decay_style": "cosine", 67 | "warmup": 0.01, 68 | "checkpoint_factor": 10000, 69 | "eval_interval": 1000, 70 | "eval_iters": 10, 71 | 72 | "log_interval": 100, 73 | "steps_per_print": 10, 74 | "keep_last_n_checkpoints": 4, 75 | "wall_clock_breakdown": true, 76 | 77 | "hostfile": "/mock_path" 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 16, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00025, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.000025, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | 
"zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 16, 45 | "gas": 1, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight_decay": 0.1, 58 | "hidden_dropout": 0, 59 | "attention_dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train_iters": 143000, 73 | "lr_decay_iters": 143000, 74 | "distributed_backend": "nccl", 75 | "lr_decay_style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint_factor": 1000, 78 | "eval_interval": 40000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 33 | "zero_optimization": { 34 | "stage": 0, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data_impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.0, 57 | "hidden_dropout": 0.0, 58 | "attention_dropout": 0.0, 59 | 60 | "precision": "bfloat16", 61 | 62 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 63 | # misc. 
training settings 64 | "train_iters": 320000, 65 | "lr_decay_iters": 320000, 66 | "distributed_backend": "nccl", 67 | "lr_decay_style": "cosine", 68 | "warmup": 0.01, 69 | "checkpoint_factor": 10000, 70 | "eval_interval": 1000, 71 | "eval_iters": 10, 72 | 73 | # logging 74 | "log_interval": 100, 75 | "steps_per_print": 10, 76 | "keep_last_n_checkpoints": 4, 77 | "wall_clock_breakdown": true, 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], 5 | "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], 6 | "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set, use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split": "995,4,1" 18 | # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", 19 | 20 | "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard_dir": "/mnt/ssd-1/tensorboard", 25 | "log_dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num_layers": 12, 12 | "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 
15 | "seq_length": 2048, 16 | "max_position_embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos_emb": "none", 19 | "no_weight_tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e-8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data_impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": false, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0.0, 46 | "attention_dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. training settings 58 | "train_iters": 320000, 59 | "lr_decay_iters": 320000, 60 | "distributed_backend": "nccl", 61 | "lr_decay_style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint_factor": 10000, 64 | "eval_interval": 1000, 65 | "eval_iters": 10, 66 | 67 | # logging 68 | "log_interval": 100, 69 | "steps_per_print": 10, 70 | "keep_last_n_checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | #"data_path": "/var/nfs/data/enwik8/enwik8_text_document", 4 | "data_path": "/gpt-neox/data/enwik8/enwik8_text_document", 5 | 6 | # or for weighted datasets: 7 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 10 | # "train-data-weights": [1., 2.], 11 | # "test-data-weights": [2., 1.], 12 | # "valid-data-weights": [0.5, 0.4], 13 | 14 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
15 | # WARNING: setting this to True will override any user provided weights 16 | # "weight_by_num_documents": false, 17 | # "weighted_sampler_alpha": 0.3, 18 | 19 | "vocab_file": "/gpt-neox/data/gpt2-vocab.json", 20 | "merge_file": "/gpt-neox/data/gpt2-merges.txt", 21 | 22 | "save": "checkpoints", 23 | "load": "checkpoints", 24 | "checkpoint_validation_with_forward_pass": False, 25 | 26 | "tensorboard_dir": "tensorboard", 27 | "log_dir": "./logs", 28 | "use_wandb": True, 29 | "wandb_host": "https://api.wandb.ai", 30 | "wandb_project": "neox" 31 | } 32 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "num_layers": 12, 5 | "hidden_size": 768, 6 | "num_attention_heads": 12, 7 | "seq_length": 2048, 8 | "max_position_embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos_emb": "rotary", 11 | "no_weight_tying": true, 12 | "scaled_upper_triang_masked_softmax_fusion": true, 13 | "bias_gelu_fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true 30 | }, 31 | "train_micro_batch_size_per_gpu": 4, 32 | "data_impl": "mmap", 33 | "split": "949,50,1", 34 | "checkpoint_activations": true, 35 | "checkpoint_num_layers": 1, 36 | "partition_activations": true, 37 | "synchronize_each_layer": true, 38 | "gradient_clipping": 1.0, 39 | "weight_decay": 0.0, 40 | "hidden_dropout": 0.0, 41 | "attention_dropout": 0.0, 42 | "fp16": { 43 | "enabled": true, 44 | "loss_scale": 0, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "train_iters": 320000, 50 | "lr_decay_iters": 320000, 51 | "distributed_backend": "nccl", 52 | "lr_decay_style": "cosine", 53 | "warmup": 0.01, 54 | "checkpoint_factor": 10000, 55 | "eval_interval": 1000, 56 | "eval_iters": 10, 57 | "log_interval": 100, 58 | "steps_per_print": 10, 59 | "keep_last_n_checkpoints": 4, 60 | "wall_clock_breakdown": true, 61 | "launcher": "slurm", 62 | "deepspeed_slurm": true, 63 | "comment": "neox" 64 | } 65 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data_path": "data/enwik8/enwik8_text_document", 3 | "vocab_file": "data/gpt2-vocab.json", 4 | "merge_file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard_dir": "tensorboard", 8 | "log_dir": "logs", 9 | "use_wandb": true, 10 | "wandb_host": "https://api.wandb.ai", 11 | "wandb_project": "neox" 12 | } 13 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # 
illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text_gen_type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num_samples": 10, 17 | 18 | # input/output file 19 | "sample_input_file": "sample_input.txt", 20 | "sample_output_file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /gpt-neox/configs/pile.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "/data/pile/pile_text_document", 4 | 5 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 6 | # WARNING: setting this to True will override any user provided weights 7 | # "weight_by_num_documents": false, 8 | # "weighted_sampler_alpha": 0.3, 9 | 10 | "tokenizer_type": "HFTokenizer", 11 | "vocab_file": "/data/pile/20B_tokenizer.json", 12 | 13 | # "save": "checkpoints", 14 | # "load": "checkpoints", 15 | "checkpoint_validation_with_forward_pass": False, 16 | 17 | "tensorboard_dir": "tensorboard", 18 | "log_dir": "./logs", 19 | "use_wandb": True, 20 | "wandb_host": "https://api.wandb.ai", 21 | "wandb_project": "neox" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/1-4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0002, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00002, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 16, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 
53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/12B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | 5 | "num_layers": 36, 6 | "hidden_size": 5120, 7 | "num_attention_heads": 40, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 36]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00012, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000012, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 1260000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 1260000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "train_micro_batch_size_per_gpu": 8, 44 | "gradient_accumulation_steps": 2, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0, 55 | "attention_dropout": 0, 56 | 57 | "fp16": { 58 | "fp16": true, 59 | "enabled": true, 60 | "loss_scale": 0, 61 | "loss_scale_window": 1000, 62 | "initial_scale_power": 12, 63 | "hysteresis": 2, 64 | "min_loss_scale": 1 65 | }, 66 | 67 | "train_iters": 143000, 68 | "lr_decay_iters": 143000, 69 | "distributed_backend": "nccl", 70 | "lr_decay_style": "cosine", 71 | "warmup": 0.01, 72 | "checkpoint_factor": 1000, 73 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 74 | "eval_interval": 143000, 75 | "eval_iters": 10, 76 | 77 | "log_interval": 10, 78 | "steps_per_print": 10, 79 | "wall_clock_breakdown": true, 80 | 81 | "log_grad_norm": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/160M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": 
true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 12]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 16, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": true, 17 | "bias_gelu_fusion": true, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00025, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000025, 31 | 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "fp16": { 44 | "enabled": true, 45 | "type": "bfloat16", 46 | "auto_cast": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "initial_scale_power": 12, 50 | "hysteresis": 2, 51 | "min_loss_scale": 1 52 | }, 53 | 54 | "fp32_allreduce": true, 55 | 56 | "train_micro_batch_size_per_gpu": 4, 57 | "gradient_accumulation_steps": 4, 58 | "data_impl": "mmap", 59 | "num_workers": 1, 60 | 61 | "checkpoint_activations": true, 62 | "checkpoint_num_layers": 1, 63 | "partition_activations": true, 64 | "synchronize_each_layer": true, 65 | 66 | "gradient_clipping": 1.0, 67 | "weight_decay": 
0.1, 68 | "hidden_dropout": 0, 69 | "attention_dropout": 0, 70 | 71 | "train_iters": 143000, 72 | "lr_decay_iters": 143000, 73 | "distributed_backend": "nccl", 74 | "lr_decay_style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint_factor": 1000, 77 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 78 | "eval_interval": 143000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | "tokenizer_type": "HFTokenizer" 86 | } 87 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/2-8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 32]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00016, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.000016, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 40000, 78 | "eval_iters": 10, 79 | 80 | "log_grad_norm": true, 81 | 82 | "log_interval": 10, 83 | "steps_per_print": 10, 84 | "wall_clock_breakdown": true, 85 | 86 | "tokenizer_type": "HFTokenizer" 87 | } 88 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/410M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": 
"small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0003, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00003, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 4096, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 32]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00012, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8 29 | } 30 | }, 31 | 32 | "min_lr": 0.000012, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 1260000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 1260000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": 
[0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/70M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 6, 6 | "hidden_size": 512, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 6]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.001, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.0001, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 100000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/slurm_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab-file": "data/gpt2-vocab.json", 3 | "merge-file": "data/gpt2-merges.txt", 4 | "save": "checkpoints", 5 | "checkpoint_validation_with_forward_pass": false, 6 | "tensorboard-dir": "tensorboard", 7 | "log-dir": "logs", 8 | "use_wandb": true, 9 | "wandb_host": "https://api.wandb.ai", 10 | "wandb_project": "neox" 11 | } 12 | -------------------------------------------------------------------------------- /gpt-neox/deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed.launcher.runner 20 | 21 | 22 | def main(): 23 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 24 | 25 | from megatron.neox_arguments import NeoXArgs 26 | from megatron.utils import get_wandb_api_key 27 | 28 | neox_args = NeoXArgs.consume_deepy_args() 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | deepspeed.launcher.runner.main(deepspeed_main_args) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /gpt-neox/eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /gpt-neox/hostfile: -------------------------------------------------------------------------------- 1 | node01 slots=1 2 | node02 slots=1 3 | -------------------------------------------------------------------------------- /gpt-neox/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .initialize import initialize_megatron 27 | from .neox_arguments import NeoXArgs 28 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/gpt-neox/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | import subprocess 18 | 19 | from torch.utils import cpp_extension 20 | from pathlib import Path 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | 24 | # Setting this param to a list has a problem of generating different 25 | # compilation commands (with different order of architectures) and 26 | # leading to recompilation of fused kernels. Set it to empty string 27 | # to avoid recompilation and assign arch flags explicitly in 28 | # extra_cuda_cflags below 29 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 30 | 31 | 32 | def load_fused_kernels(): 33 | try: 34 | import scaled_upper_triang_masked_softmax_cuda 35 | import scaled_masked_softmax_cuda 36 | except (ImportError, ModuleNotFoundError) as e: 37 | print("\n") 38 | print(e) 39 | print("=" * 100) 40 | print( 41 | f"ERROR: Fused kernels configured but not properly installed. 
Please run `pip install {str(srcpath)}` to install them" 42 | ) 43 | print("=" * 100) 44 | exit() 45 | return 46 | -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def _get_cuda_bare_metal_version(cuda_dir): 9 | raw_output = subprocess.check_output( 10 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 11 | ) 12 | output = raw_output.split() 13 | release_idx = output.index("release") + 1 14 | release = output[release_idx].split(".") 15 | bare_metal_major = release[0] 16 | bare_metal_minor = release[1][0] 17 | 18 | return raw_output, bare_metal_major, bare_metal_minor 19 | 20 | 21 | srcpath = Path(__file__).parent.absolute() 22 | cc_flag = [] 23 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append("-gencode") 26 | cc_flag.append("arch=compute_80,code=sm_80") 27 | 28 | nvcc_flags = [ 29 | "-O3", 30 | "-gencode", 31 | "arch=compute_70,code=sm_70", 32 | "--use_fast_math", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "--expt-relaxed-constexpr", 36 | "--expt-extended-lambda", 37 | ] 38 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} 39 | layernorm_cuda_args = { 40 | "cxx": ["-O3"], 41 | "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], 42 | } 43 | setup( 44 | name="fused_kernels", 45 | version="0.0.1", 46 | author="Sid Black & Alejandro Molina et al.", 47 | author_email="alejandro.molina@aleph-alpha.de", 48 | include_package_data=False, 49 | ext_modules=[ 50 | CUDAExtension( 51 | "scaled_upper_triang_masked_softmax_cuda", 52 | [ 53 | str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), 54 | str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), 55 | ], 56 | extra_compile_args=cuda_ext_args, 57 | ), 58 | CUDAExtension( 59 | "scaled_masked_softmax_cuda", 60 | [ 61 | str(srcpath / "scaled_masked_softmax.cpp"), 62 | str(srcpath / "scaled_masked_softmax_cuda.cu"), 63 | ], 64 | 
extra_compile_args=cuda_ext_args, 65 | ), 66 | ], 67 | cmdclass={"build_ext": BuildExtension}, 68 | ) 69 | -------------------------------------------------------------------------------- /gpt-neox/megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /gpt-neox/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import get_params_for_weight_decay_optimization 20 | from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /gpt-neox/megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /gpt-neox/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_io_parallel_group 35 | from .initialize import initialize_model_parallel 36 | from .initialize import model_parallel_is_initialized 37 | 38 | from .layers import ColumnParallelLinear 39 | from .layers import RowParallelLinear 40 | from .layers import VocabParallelEmbedding 41 | from .layers import ParallelRelativePositionBias 42 | 43 | from .mappings import copy_to_model_parallel_region 44 | from .mappings import gather_from_model_parallel_region 45 | from .mappings import reduce_from_model_parallel_region 46 | from .mappings import scatter_to_model_parallel_region 47 | 48 | from .random import checkpoint 49 | from .random import get_cuda_rng_tracker 50 | from .random import model_parallel_cuda_manual_seed 51 | 52 | from .utils import divide 53 | from .utils import split_tensor_along_last_dim 54 | -------------------------------------------------------------------------------- /gpt-neox/megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Mostly moving to using checkpointing from deepspeed (identical code anyway), so currently this file is only imports. 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether to apply model parallelism to checkpointed hidden states. 27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object.
30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /gpt-neox/megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | Generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | Updates a property value if the key already exists. 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error. 33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value(): property to update " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exist. 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /gpt-neox/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8>=1.5.6 2 | clang-format>=13.0.1 3 | pre-commit>=2.17.0 4 | pytest>=6.2.3 5 | pytest-cov>=2.11.1 6 | pytest-forked>=1.3.0 7 | pytest-xdist 8 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==2.2.1 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111>=8.6.0 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-s3.txt: -------------------------------------------------------------------------------- 1 | hf-transfer>=0.1.3 2 | boto3 -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==2.0.0.dev20221202 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.13.0 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-wandb.txt: -------------------------------------------------------------------------------- 1 | wandb>=0.10.28 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | best_download 2 | git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed 3 | ftfy>=6.0.1 4 | git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 5 | huggingface_hub>=0.11.0 6 | lm_eval==0.3.0 7 | mpi4py>=3.0.3 8 | numpy>=1.22.0 9 | pybind11>=2.6.2 10 | regex 11 | sentencepiece 12 | six 13 | tiktoken>=0.1.2 14 | tokenizers>=0.12.1 15 | transformers==4.30.2 16 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/0.remove_nvidia_driver_and_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get purge nvidia* 4 | sudo apt-get autoremove 5 | sudo apt-get autoclean 6 | sudo rm -rf /usr/local/cuda* 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/1.cuda_11_7_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt update 4 | sudo apt install wget axel 5 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin 6 | sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | axel -a -n 20 https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-ubuntu2004-11-7-local_11.7.0-515.43.04-1_amd64.deb 8 | sudo dpkg -i cuda-repo-ubuntu2004-11-7-local_11.7.0-515.43.04-1_amd64.deb 
9 | sudo cp /var/cuda-repo-ubuntu2004-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/ 10 | sudo apt-get update 11 | sudo apt-get -y install nvidia-driver-515 12 | echo "blacklist nouveau" | sudo tee /etc/modprobe.d/blacklist-nouveau.conf 13 | echo "options nouveau modeset=0" | sudo tee -a /etc/modprobe.d/blacklist-nouveau.conf 14 | sudo update-initramfs -u 15 | sudo apt-get -y install cuda-11-7 16 | 17 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/11.cat_csv_from_log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ]; then 4 | echo "Usage: $0 [log_file]" 5 | exit 0 6 | fi 7 | 8 | if [ ! -f $1 ]; then 9 | echo "$1 is missing." 10 | exit 0 11 | fi 12 | 13 | #grep -a lm_loss $1 | awk '{printf "%s,%s,%s,%f\n",substr($8, 1, length($8)-1),$2,substr($26,1,length($26)-6),$29}' | tee $1.csv 14 | grep -a lm_loss $1 | awk '{printf "%s,%f\n",$2,$5}' | tee $1.csv 15 | grep -a lm_loss $1 | awk '{printf "%s,%s,%f\n",substr($8,1,length($8)-1),substr($26,1,length($26)-6),$35}' | tee $1.csv 16 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/12.run_and_collect_logs_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #CONFIGS=(125M.yml 6-7B.yml) 4 | CONFIG=6-7B.yml 5 | CONT_NAME="gpt-neox-container" 6 | #BATCHS=(1 2 4 8 16 32 64 128) 7 | BATCHS=(1 2 4 8 16 32) 8 | #GPUS=(1 2 3 4 5 6 7 8) 9 | #gpus per node 10 | GPUS=4 11 | NODES=(s1 s8) 12 | #Pipeline Parallel 13 | PP=(1 2 4 8) 14 | #PP=(8) 15 | TRAIN_ITERS=500 16 | TARGET_LM_LOSS=0 17 | TRAIN_TIME=600 18 | HOSTFILE=./scripts_swsok/hostfile 19 | 20 | if [ ! -z "$1" ]; then 21 | GPUS=$1 22 | fi 23 | 24 | rm logs/* 25 | rm checkpoints/* -rf 26 | mkdir swsok-results/ 27 | mkdir checkpoints/ 28 | 29 | rm $HOSTFILE 30 | for i in ${NODES[@]}; do 31 | echo "$i slots=$GPUS" >> $HOSTFILE 32 | done 33 | 34 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$CONFIG 35 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$CONFIG 36 | sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$CONFIG 37 | sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$CONFIG 38 | 39 | for i in ${NODES[@]}; do 40 | ssh $i docker stop $CONT_NAME 41 | ssh $i docker run -d -it --name $CONT_NAME --rm --network host --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=/var/nfs,dst=/var/nfs swsok/gpt-neox:v7 42 | done 43 | 44 | for p in ${PP[@]}; do 45 | for b in ${BATCHS[@]}; do 46 | echo "$CONFIG Nodes ${#NODES[@]} GPUS $GPUS BATCH $b Pipeline $p" > logs/current_test_setting.txt 47 | 48 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$CONFIG 49 | sed -i "/\"pipe_parallel_size\"/c\ \"pipe_parallel_size\": \\$p," configs/$CONFIG 50 | 51 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$CONFIG configs/local_setup.yml configs/etri_cluster.yml 52 | 53 | mv logs/*stdout.txt swsok-results/conf-$CONFIG-gpus-$GPUS-pp-$p-microbatch-$b-$(date '+%Y-%m-%d').txt 54 | rm logs/* 55 | rm checkpoints/* -rf 56 | 57 | sleep 1 58 | done 59 | done 60 | 61 | for i in ${NODES[@]}; do 62 | ssh $i docker stop $CONT_NAME 63 | done 64 | 65 | 66 | 
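The benchmark sweep scripts in scripts_swsok (12 above and 15-17 below) all mutate the training configuration the same way: a `sed -i "/\"key\"/c\ ..."` one-liner that replaces the whole `"key": value` line in the YAML file before each run. As a minimal sketch of that pattern factored into a reusable helper; the `set_config_key` name is hypothetical and not part of the repo:

```bash
#!/bin/bash
# Hypothetical helper (illustrative only): rewrite a single "key": value line
# in a gpt-neox config, mirroring the exact sed quoting the sweep scripts use.
# Usage: set_config_key <config file> <key> <value>
set_config_key() {
    local file=$1 key=$2 value=$3
    # sed's 'c\' command replaces every line matching the key with a fresh pair.
    sed -i "/\"$key\"/c\ \"$key\": \\$value," "$file"
}

# Example: the inner loop of 12.run_and_collect_logs_multi.sh in these terms.
set_config_key configs/6-7B.yml train_micro_batch_size_per_gpu 8
set_config_key configs/6-7B.yml pipe_parallel_size 4
```

Note the caveat this inherits from the scripts themselves: the match is a plain substring search, so a short key such as "stage" will also rewrite any other config line that happens to contain that quoted key.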
-------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/15.long_seqlen_1.3B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(1 2) 6 | GPUS=8 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=20 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(32 64) 14 | 15 | docker stop $CONT_NAME 16 | sudo rm logs/* -rf 17 | rm checkpoints/* -rf 18 | 19 | i=$GPUS 20 | conf=$CONFIG 21 | 22 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 23 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | 25 | 26 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 27 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 28 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 29 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 30 | 31 | for b in ${BATCHS[@]}; do 32 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 33 | 34 | for s in ${SEQLEN[@]}; do 35 | echo "$conf GPU $i microbatch $b seqlen $s" > logs/current_test_setting.txt 36 | 37 | sed -i "/\"seq_length\"/c\ \"seq_length\": \\$s," configs/$conf 38 | sed -i "/\"max_position_embeddings\"/c\ \"max_position_embeddings\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-zero-3-microbatch-$b-seqlen-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm -rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/16.zero_opt_stages_1.3B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(2) 6 | GPUS=8 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=10 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(8 16 32 64) 14 | ZERO_STAGE=(2 3) 15 | 16 | docker stop $CONT_NAME 17 | sudo rm logs/* -rf 18 | rm checkpoints/* -rf 19 | 20 | i=$GPUS 21 | conf=$CONFIG 22 | 23 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox
--security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 25 | 26 | 27 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 28 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 29 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 30 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 31 | 32 | for b in ${BATCHS[@]}; do 33 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 34 | 35 | for s in ${ZERO_STAGE[@]}; do 36 | echo "$conf GPU $i microbatch $b zero-stage $s" > logs/current_test_setting.txt 37 | 38 | sed -i "/\"stage\"/c\ \"stage\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-microbatch-$b-seq-32k-stage-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm -rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/17.760M_zero_stages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(1 2 4 8) 6 | GPUS=1 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=10 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(8 16 32 64) 14 | ZERO_STAGE=(0 1 2 3) 15 | 16 | docker stop $CONT_NAME 17 | sudo rm logs/* -rf 18 | rm checkpoints/* -rf 19 | 20 | i=$GPUS 21 | conf=$CONFIG 22 | 23 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 25 | 26 | 27 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 28 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 29 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 30 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 31 | 32 | for b in ${BATCHS[@]}; do 33 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 34 | 35 | for s in ${ZERO_STAGE[@]}; do 36 | echo "$conf GPU $i microbatch $b zero-stage $s" > logs/current_test_setting.txt 37 | 38 | sed -i "/\"stage\"/c\ \"stage\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-microbatch-$b-seq-32k-stage-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm
-rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/2.docker_and_nvidia_container_toolkit_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #remove docker and reinstall 4 | for pkg in docker.io docker-doc docker-compose docker-compose-v2 podman-docker containerd runc; do 5 | sudo apt-get remove $pkg; 6 | done 7 | 8 | sudo apt-get update 9 | sudo apt-get install ca-certificates curl gnupg 10 | sudo install -m 0755 -d /etc/apt/keyrings 11 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg 12 | sudo chmod a+r /etc/apt/keyrings/docker.gpg 13 | 14 | # Add the repository to Apt sources: 15 | echo \ 16 | "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 17 | "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \ 18 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 19 | 20 | sudo apt-get update 21 | 22 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin 23 | 24 | #install nvidia-container-toolkit 25 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 26 | curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 27 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 28 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 29 | sudo apt-get update 30 | sudo apt-get install -y nvidia-container-toolkit 31 | 32 | sudo usermod -aG docker $USER 33 | sudo service docker restart 34 | #logout and login to run docker without sudo 35 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/3.required_packages_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt install libopenmpi-dev 4 | pip install mpi4py 5 | # urllib3 v2 conflicts with original gpt-neox codes 6 | #pip uninstall urllib3 7 | #pip install urllib3==1.26.16 8 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/4.requirements_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -r requirements/requirements.txt 4 | pip install -r requirements/requirements-wandb.txt 5 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/5.prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=~/.local/lib/python3.8/site-packages:/usr/lib/python3/dist-packages 4 | 5 | python prepare_data.py -d ./data 6 | 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/6.pretrain_125M_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./deepy.py train.py configs/125M.yml configs/local_setup.yml 4 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/7.patch_best_download.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sed -i 's/requests.packages.urllib3.util.retry/urllib3.util.retry/g' ~/.local/lib/python3.8/site-packages/best_download/__init__.py 4 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/8.print_loss_progress.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grep lm_loss logs/gptneox-test_stdout.txt | awk '{print $26}' 4 | 5 | #watch -n 10 "grep lm_loss gptneox-test_stdout.txt | awk '{print \$5\$6\" \"\$26}' | tail -n 10" 6 | #grep lm_loss gptneox-test_stdout.txt | awk '{print $5 $6 " " $23"/222.2TFLOPS" " " $26}' 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/9.run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #docker run --rm -it --network host --gpus=all -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=/var/nfs,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 4 | docker run --rm -it --network host --gpus=all -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 5 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/run_sshd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo /etc/init.d/ssh start 4 | 5 | /bin/bash 6 | -------------------------------------------------------------------------------- /gpt-neox/tests/README.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | 3 | Tests use pytest with the coverage and forked plugins. Install with: 4 | 5 | ```bash 6 | pip install -r requirements/requirements-dev.txt 7 | ``` 8 | 9 | Download the required test data: 10 | ```bash 11 | python prepare_data.py 12 | ``` 13 | 14 | # Run 15 | 16 | Tests can be run using pytest. 17 | 18 | * The argument --forked needs to be provided 19 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation) 20 | * A subset of tests can be selected by pointing to the module within tests 21 | 22 | ```bash 23 | # run all tests, output coverage report of megatron module in terminal 24 | pytest --forked --cov-report term --cov=megatron tests 25 | 26 | # run tests in tests/model, output coverage report of megatron module as html 27 | pytest --forked --cov-report html --cov=megatron tests/model 28 | 29 | # run tests in tests/model/test_model_generation.py, don't output coverage report 30 | pytest --forked tests/model/test_model_generation.py 31 | ``` 32 | 33 | Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. 34 | The test cases for cpu can be run with: 35 | ```bash 36 | pytest tests -m cpu 37 | ``` 38 | 39 | If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files. 40 | 41 | ```bash 42 | python -m http.server --directory htmlcov 8000 43 | ``` 44 | 45 | 46 | ## Tips and Tricks 47 | If you see this kind of error: 48 | ``` 49 | RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method 50 | ``` 51 | It means that you used some torch.cuda function before the test created the processes. 52 | -------------------------------------------------------------------------------- /gpt-neox/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/gpt-neox/tests/__init__.py -------------------------------------------------------------------------------- /gpt-neox/tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .test_model_instantiation import run_test_model_instantiation 16 | from .test_model_train import run_train_test 17 | from .test_model_checkpoint import run_checkpoint_test 18 | -------------------------------------------------------------------------------- /gpt-neox/tests/neox_args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /gpt-neox/tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /gpt-neox/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | -------------------------------------------------------------------------------- /gpt-neox/tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX Auxiliary Tools 2 | 3 | This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code. 4 | 5 | ## Bash 6 | 7 | This directory contains some simple, frequently used bash commands to make working on multiple machines easier. 8 | 9 | ## Checkpoints 10 | 11 | This directory contains tools for manipulating and converting checkpoints including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x. 12 | 13 | ## Datasets 14 | 15 | This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library. 16 | -------------------------------------------------------------------------------- /gpt-neox/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/README.md: -------------------------------------------------------------------------------- 1 | # Bash Scripts 2 | Useful for running distributed per-node scripts on e.g. Kubernetes 3 | 4 | * `kill.sh` kills all python processes 5 | * `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/` 6 | * `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/` 7 | * `sync.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` 8 | * `syncdir.sh` uses pdcp to recursively copy every file or directory in a provided path to all of the nodes listed in `/job/hosts/` 9 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /gpt-neox/tools/ckpts/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") 53 | -------------------------------------------------------------------------------- /gpt-neox/tools/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /gpt-neox/tools/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /gpt-neox/tools/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /gpt-neox/tools/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") 53 | -------------------------------------------------------------------------------- /gpt-neox/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | if __name__ == "__main__": 23 | neox_args = NeoXArgs.consume_neox_args() 24 | neox_args.configure_distributed_args() 25 | neox_args.build_tokenizer() # tokenizer needs to be built in training in order to set the padding vocab 26 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 27 | pretrain(neox_args=neox_args) 28 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/00-prepare-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt update 4 | sudo apt upgrade -y 5 | 6 | #disable swap 7 | sudo swapoff -a 8 | sudo sed -e '/swap/ s/^#*/#/' -i /etc/fstab 9 | 10 | nodelistfile='nodes.txt' 11 | USER=etri-aicomputing 12 | 13 | if [ -e $nodelistfile ]; then 14 | while read p; do 15 | ssh-copy-id $USER@$p 16 | #for passwordless sudo 17 | #ssh $USER@$p sudo bash -c 'echo "etri-aicomputing ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' 18 | done < "$nodelistfile" 19 | fi 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/01-install-cudnn-and-nvidia-driver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y update 4 | sudo apt-get -y remove --purge '^nvidia-.*' 5 | sudo apt-get -y remove --purge 'cuda-.*' 6 | 7 | sudo bash -c "echo blacklist nouveau > /etc/modprobe.d/blacklist-nvidia-nouveau.conf" 8 | sudo bash -c "echo options nouveau modeset=0 >> /etc/modprobe.d/blacklist-nvidia-nouveau.conf" 9 | sudo update-initramfs -u 10 | 11 | sudo apt-get -y install nvidia-cuda-toolkit 12 | nvcc -V 13 | whereis cuda 14 | #mkdir ~/nvidia 15 | #cd ~/nvidia 16 | CUDNN_DEB_FILE="cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb" 17 | if !
[ -e $CUDNN_DEB_FILE ]; then 18 | sudo apt-get -y install axel 19 | axel -n 20 https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/${CUDNN_DEB_FILE} 20 | # wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/${CUDNN_DEB_FILE} 21 | fi 22 | sudo dpkg -i ${CUDNN_DEB_FILE} 23 | sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ 24 | sudo apt update 25 | sudo apt -y install libcudnn8=8.8.0.121-1+cuda12.0 26 | sudo apt -y install libcudnn8-dev=8.8.0.121-1+cuda12.0 27 | sudo apt -y install libcudnn8-samples=8.8.0.121-1+cuda12.0 28 | 29 | source ~/.bashrc 30 | 31 | sudo apt install -y ubuntu-drivers-common 32 | ubuntu-drivers devices 33 | sudo apt install -y nvidia-driver-525-server 34 | 35 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/02-install-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg lsb-release 3 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 4 | echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 5 | sudo apt-get update -y 6 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io 7 | sudo docker run hello-world 8 | sudo usermod -aG docker $USER 9 | #&& newgrp docker 10 | sudo service docker restart 11 | 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/03-install-nvidia-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | distribution=$(.
/etc/os-release;echo $ID$VERSION_ID) 3 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 4 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 5 | sudo apt-get -y update 6 | sudo apt-get -y install nvidia-docker2 7 | sudo systemctl restart docker 8 | sudo docker run --runtime nvidia nvidia/cuda:10.1-base /usr/bin/nvidia-smi 9 | 10 | sudo bash -c 'cat <<EOF > /etc/docker/daemon.json 11 | { 12 | "exec-opts": ["native.cgroupdriver=systemd"], 13 | "log-driver": "json-file", 14 | "log-opts": { 15 | "max-size": "100m" 16 | }, 17 | "data-root": "/mnt/storage/docker_data", 18 | "storage-driver": "overlay2", 19 | "default-runtime" : "nvidia", 20 | "runtimes" : { 21 | "nvidia" : { 22 | "path": "/usr/bin/nvidia-container-runtime", 23 | "runtimeArgs" : [] 24 | } 25 | } 26 | } 27 | EOF' 28 | sudo systemctl restart docker 29 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/04-install-k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo swapoff -a 3 | sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab 4 | 5 | sudo apt-get install -y iptables arptables ebtables 6 | sudo apt-get update && sudo apt-get install -y apt-transport-https curl 7 | curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - 8 | cat <>$shellconf 52 | echo 'alias k=kubectl' >>$shellconf 53 | echo 'complete -F __start_kubectl k' >>$shellconf 54 | fi 55 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/06-install-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ~ 4 | git clone https://github.com/kubeflow/manifests.git 5 | cd manifests 6 | git checkout v1.6.0 7 | 8 | kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - 9 | kubectl wait --for=condition=ready pod -l 'app in (cert-manager,webhook)' --timeout=180s -n cert-manager 10 | kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - 11 | 12 | while !
kustomize build example | awk '!/well-defined/' | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done 13 | 14 | # wait until all pods become Running state 15 | watch kubectl get pod -A 16 | 17 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/07-certificate-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl apply -f gateway.yaml 4 | kubectl apply -f certificate.yaml 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/08-port-forward-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup kubectl port-forward --address="0.0.0.0" svc/istio-ingressgateway -n istio-system 8080:443 & 4 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/09-print-join-cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$(kubeadm token create --print-join-command) 4 | echo "sudo $cmd" 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/10-enable-k8s-dashboard-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #kubectl apply -f dashboard-adminuser.yaml 4 | 5 | #kubectl apply -f cluster-role-binding.yaml 6 | 7 | #kubectl -n kubernetes-dashboard describe secrets 8 | # copy admin-user's token value for kubernetes-dashboard 9 | 10 | kubectl create serviceaccount admin-user 11 | kubectl create clusterrolebinding test-user-binding --clusterrole=cluster-admin --serviceaccount=default:admin-user 12 | kubectl get secrets 13 | #specify admin-user's token name in next cmd 14 | kubectl describe secret admin-user-token-8bjnr 15 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/11-reset-k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo kubeadm reset 4 | rm ~/.kube/config 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/12-add-kubeflow-user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl create -f profile.yaml 4 | 5 | #kubectl apply -f profile.yaml #if you are modifying the profile 6 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/13-port-forward-k8s-container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl port-forward test222-0 60022:22 -n aicomputing1 4 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/14-remove-a-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ]; then 4 | echo "Usage: $0 [node name to remove]" 5 | exit 0 6 | fi 7 | 8 | kubectl drain $1 --delete-local-data --force --ignore-daemonsets 9 | kubectl delete node $1 10 | 11 | ssh $1 "sudo kubeadm reset" 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/certificate.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: kubeflow-ingressgateway-certs 5 | namespace: istio-system 6 | spec: 7 | commonName: example.com #Domain name 8 | issuerRef: 9 | kind: ClusterIssuer 10 | name: kubeflow-self-signing-issuer 11 | secretName: kubeflow-ingressgateway-certs 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: admin-user 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: cluster-admin 9 | subjects: 10 | - kind: ServiceAccount 11 | name: admin-user 12 | namespace: kubernetes-dashboard 13 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/dashboard-adminuser.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: admin-user 5 | namespace: kubernetes-dashboard 6 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: Gateway 3 | metadata: 4 | name: kubeflow-gateway 5 | namespace: kubeflow 6 | spec: 7 | selector: 8 | istio: ingressgateway 9 | servers: 10 | - hosts: 11 | - "*" 12 | port: 13 | name: http 14 | number: 80 15 | protocol: HTTP 16 | # Upgrade HTTP to HTTPS 17 | tls: 18 | httpsRedirect: true 19 | - hosts: 20 | - "*" 21 | port: 22 | name: https 23 | number: 443 24 | protocol: HTTPS 25 | tls: 26 | mode: SIMPLE 27 | credentialName: kubeflow-ingressgateway-certs 28 | 29 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing1 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing1@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2Gi 17 | # requests.nvidia.com/gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing1 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing1@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2Gi 17 | # requests.nvidia.com/gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | --------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing2 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing2@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing3 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing3@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing4 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing4@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile5.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing5 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing5@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile6.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing6 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing6@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | 
/k8s_kubeflow_install/common/profile7.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing7
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing7@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile8.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing8
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing8@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile9.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing9
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing9@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/Dockerfile.org:
--------------------------------------------------------------------------------
1 | #FROM public.ecr.aws/j1r0q0g6/notebooks/notebook-servers/jupyter-tensorflow-cuda-full:v1.5.0
2 | FROM public.ecr.aws/j1r0q0g6/notebooks/notebook-servers/jupyter-pytorch-cuda-full:v1.5.0
3 | USER root
4 | ENV NB_USER=jovyan
5 | 
6 | # Replace the Nvidia GPG public key
7 | #RUN rm /etc/apt/sources.list.d/cuda.list \
8 | #    && rm /etc/apt/sources.list.d/nvidia-ml.list
9 | 
10 | RUN apt-get update && apt-get install -y --no-install-recommends \
11 |     sudo \
12 |     apt-utils \
13 |     && usermod -aG sudo ${NB_USER} \
14 |     && echo ${NB_USER}:${NB_USER} | chpasswd \
15 |     && echo "${NB_USER} ALL=(root) NOPASSWD:SETENV: /init" >> /etc/sudoers
16 | 
17 | # install - requirements.txt
18 | # COPY requirements.txt requirements.txt
19 | # RUN pip3 install -r requirements.txt
20 | 
21 | USER $NB_USER
22 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/cuda-requirements.txt:
--------------------------------------------------------------------------------
1 | --find-links https://download.pytorch.org/whl/torch_stable.html
2 | torch==1.8.1+cu111
3 | torchvision==0.9.1+cu111
4 | torchaudio==0.8.1
5 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/make_dockerimage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | 
4 | cp Dockerfile.scratch Dockerfile
5 | docker build --no-cache -t swsok/nvidia-pytorch-kubeflow:v1 .
6 | docker login --username swsok --password etri-aicomputing
7 | docker push swsok/nvidia-pytorch-kubeflow:v1
8 | 
9 | #cp Dockerfile.org Dockerfile
10 | #docker build --no-cache -t swsok/jupyter-pytorch-cuda-full-sudo:v1.5.0 .
11 | #docker login --username swsok --password etri-aicomputing
12 | #docker push swsok/jupyter-pytorch-cuda-full-sudo:v1.5.0
13 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/requirements.txt:
--------------------------------------------------------------------------------
1 | 
2 | jupyterlab==3.4.3
3 | notebook==6.4.12
4 | ipykernel==6.15.0
5 | # kubeflow packages
6 | kfp==1.6.3
7 | kfp-server-api==1.6.0
8 | kfserving==0.5.1
9 | 
10 | # common packages
11 | bokeh==2.3.2
12 | cloudpickle==1.6.0
13 | dill==0.3.4
14 | ipympl==0.7.0
15 | ipywidgets==7.6.3
16 | jupyterlab-git==0.30.1
17 | matplotlib==3.4.2
18 | pandas==1.2.4
19 | scikit-image==0.18.1
20 | scikit-learn==0.24.2
21 | scipy==1.7.0
22 | seaborn==0.11.1
23 | xgboost==1.4.2
24 | 
25 | # pytorch packages
26 | #torchelastic==0.2.2 this currently causes a dependency conflict, should be fixed very soon
27 | fastai==2.4
28 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/s6/cont-init.d/01-copy-tmp-home:
--------------------------------------------------------------------------------
1 | #!/usr/bin/with-contenv bash
2 | cp -r -n /tmp_home/* /home/
3 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/s6/services.d/jupyterlab/run:
--------------------------------------------------------------------------------
1 | #!/usr/bin/with-contenv bash
2 | cd "${HOME}"
3 | exec /opt/conda/bin/jupyter lab \
4 |     --notebook-dir="${HOME}" \
5 |     --ip=0.0.0.0 \
6 |     --no-browser \
7 |     --allow-root \
8 |     --port=8888 \
9 |     --ServerApp.token="" \
10 |     --ServerApp.password="" \
11 |     --ServerApp.allow_origin="*" \
12 |     --ServerApp.base_url="${NB_PREFIX}" \
13 |     --ServerApp.authenticate_prometheus=False
14 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/setup_for_gpu_node_master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ORG_DIR=$PWD
4 | PROGRESS_FILE="$PWD/progress.stat"
5 | 
6 | chmod a+x common/*.sh
7 | 
8 | cd common
9 | 
10 | # create the progress stat file on the first run, then read the current stage
11 | # (reading it before it exists would fail)
12 | if ! [ -e "$PROGRESS_FILE" ]; then
13 |     ./00-prepare-nodes.sh
14 |     echo "0" > "$PROGRESS_FILE"
15 | fi
16 | STAGE=$(<"$PROGRESS_FILE")
17 | 
18 | # installing CUDNN and Nvidia-driver
19 | if (( $STAGE < 1 )); then
20 |     ./01-install-cudnn-and-nvidia-driver.sh
21 |     echo "1" > "$PROGRESS_FILE"
22 |     sudo reboot
23 | fi
24 | 
25 | # installing docker
26 | if (( $STAGE < 2 )); then
27 |     ./02-install-docker.sh
28 |     echo "2" > "$PROGRESS_FILE"
29 |     sudo reboot
30 | fi
31 | 
32 | # installing nvidia docker - for testing docker and gpus
33 | if (( $STAGE < 3 )); then
34 |     ./03-install-nvidia-docker.sh
35 |     echo "3" > "$PROGRESS_FILE"
36 | fi
37 | 
38 | # installing Kubernetes
39 | if (( $STAGE < 4 )); then
40 |     ./04-install-k8s.sh
41 |     echo "4" > "$PROGRESS_FILE"
42 |     sudo reboot
43 | fi
44 | 
45 | # configuring Kubernetes
46 | if (( $STAGE < 5 )); then
47 |     ./05-init-k8s-master-only.sh
48 |     echo "5" > "$PROGRESS_FILE"
49 |     sudo reboot
50 | fi
51 | 
52 | cd "$ORG_DIR"
53 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/setup_for_gpu_node_worker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ORG_DIR=$PWD
4 | PROGRESS_FILE="$PWD/worker_progress.stat"
5 | 
6 | chmod a+x common/*.sh
7 | 
8 | cd common
9 | 
10 | # create the progress stat file on the first run, then read the current stage
11 | if ! [ -e "$PROGRESS_FILE" ]; then
12 |     ./00-prepare-nodes.sh
13 |     echo "0" > "$PROGRESS_FILE"
14 | fi
15 | STAGE=$(<"$PROGRESS_FILE")
16 | 
17 | # installing CUDNN and Nvidia-driver
18 | if (( $STAGE < 1 )); then
19 |     ./01-install-cudnn-and-nvidia-driver.sh
20 |     echo "1" > "$PROGRESS_FILE"
21 |     sudo reboot
22 | fi
23 | 
24 | # installing docker
25 | if (( $STAGE < 2 )); then
26 |     ./02-install-docker.sh
27 |     echo "2" > "$PROGRESS_FILE"
28 |     sudo reboot
29 | fi
30 | 
31 | # installing nvidia docker - for testing docker and gpus
32 | if (( $STAGE < 3 )); then
33 |     ./03-install-nvidia-docker.sh
34 |     echo "3" > "$PROGRESS_FILE"
35 | fi
36 | 
37 | # installing Kubernetes
38 | if (( $STAGE < 4 )); then
39 |     ./04-install-k8s.sh
40 |     echo "4" > "$PROGRESS_FILE"
41 |     sudo reboot
42 | fi
43 | 
44 | cd "$ORG_DIR"
45 | 
--------------------------------------------------------------------------------
/llama3_inference/README.md:
--------------------------------------------------------------------------------
1 | # Llama3 8B inference examples
2 | 
3 | ### Basic version
4 | 
5 | python3 llama3_inference_basic.py
6 | 
7 | ### Memory offload version (some layers are swapped out to host memory to free up GPU memory)
8 | 
9 | python3 llama3_inference_memory_offload.py
10 | 
11 | ### Required python packages
12 | 
13 | pip3 install torch huggingface_hub transformers datasets bitsandbytes gradio pypdf accelerate
14 | 
15 | A Gradio-based web UI is provided; with the default configuration it is reachable at 127.0.0.1:7860.
16 | 
17 | We recommend using a GPU with more than 8GB of memory.
18 | 
19 | ## License
20 | 
21 | The results of the AIcomp project are distributed under the 3-clause BSD license.
22 | 
--------------------------------------------------------------------------------
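A minimal sketch of the memory-offload idea behind llama3_inference_memory_offload.py, assuming the Hugging Face `accelerate` integration (`device_map`/`max_memory`); the repository's script may implement the swapping differently, and the model id and memory budgets below are assumptions:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed model id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",                       # fill the GPU first...
    max_memory={0: "7GiB", "cpu": "48GiB"},  # ...then spill remaining layers to host RAM
    offload_folder="offload",                # optional spill-over to disk
)

prompt = "Explain pipeline parallelism in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))

--------------------------------------------------------------------------------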
/mlperf/pytorch-22.09/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | #To get the latest APEX
16 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.09-py3
17 | FROM ${FROM_IMAGE_NAME}
18 | 
19 | # Install dependencies
20 | RUN apt-get update \
21 |     && apt-get install -y --no-install-recommends \
22 |         bzip2 \
23 |         cabextract \
24 |         iputils-ping \
25 |         pbzip2 \
26 |         pv \
27 |     && rm -rf /var/lib/apt/lists/*
28 | 
29 | WORKDIR /workspace/bert
30 | COPY requirements.txt .
31 | RUN pip install --no-cache-dir -r requirements.txt
32 | #swsok, To enable download big files from google
33 | RUN pip install -U --no-cache-dir gdown --pre
34 | 
35 | # Preprocessing
36 | WORKDIR /workspace
37 | RUN cd /workspace && git clone https://github.com/attardi/wikiextractor.git
38 | RUN cd /workspace/wikiextractor && git checkout e4abb4cbd019b0257824ee47c23dd163919b731b
39 | 
40 | # Install BERT
41 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data
42 | WORKDIR /workspace/bert
43 | COPY . .
44 | 
45 | ENV PYTHONPATH "/workspace/bert"
46 | 
47 | RUN cd /workspace/bert/mhalib && python setup.py build && cp build/lib*/mhalib* ../
48 | WORKDIR /workspace/bert
49 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/NOTICE:
--------------------------------------------------------------------------------
1 | BERT PyTorch
2 | 
3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT
4 | licensed under the Apache License 2.0.
5 | 
6 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/README_2xa30_ngc22.09_pytorch.md:
--------------------------------------------------------------------------------
1 | ## Steps to launch training on a single node with 2xA30
2 | 
3 | ### NVIDIA DGX single node
4 | Launch configuration and system-specific hyperparameters for the NVIDIA A30
5 | multi node submission are in the following scripts:
6 | * for the 2xA30 1-node NVIDIA submission: `config_A30_1x2x224x14.sh`
7 | 
8 | Steps required to launch multi node training on NVIDIA 2xA30:
9 | 
10 | 1. Build the container:
11 | 
12 | ```
13 | docker build --pull -t <docker/registry>/mlperf-nvidia:language_model .
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 1-node NVIDIA 2xA30 training:
20 | 
21 | ```
22 | source config_A30_1x2x224x14.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> ./run_with_docker.sh
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 512-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_512x8x2x1_pack.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> sbatch -N $DGXNNODES -t $WALLTIME run.sub
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 8-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_8x8x48x1.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> sbatch -N $DGXNNODES -t $WALLTIME run.sub
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 1-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_1x8x56x1.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> ./run_with_docker.sh
24 | ```
--------------------------------------------------------------------------------
[...]
33 |         curr_real_tokens = np.sum(inputs[0][i, :] > 0)
34 |         curr_real_mask = np.sum(inputs[3][i, :] > 0)
35 |         real_tokens[idx] = curr_real_tokens
36 |         real_mask[idx] = curr_real_mask
37 |         idx += 1
38 | 
39 |     n_samples += n_samples_shard
40 | 
41 | hfile.close()
42 | 
43 | print('n_samples:,', n_samples)
44 | print('n_tokens_per_seq:', n_tokens_per_seq)
45 | print('n_mask_per_seq:', n_mask_per_seq)
46 | print('total n_pad_tokens:', np.sum(n_tokens_per_seq - real_tokens[:n_samples]))
47 | print('total n_pad_mask_tokens:', np.sum(n_mask_per_seq - real_mask[:n_samples]))
48 | print('mean pad tokens per seq:', np.mean(real_tokens[:n_samples]))
49 | print('mean pad masks per seq:', np.mean(real_mask[:n_samples]))
50 | 
51 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/cleanup_scripts/parallel_create_hdf5.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved.
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | cpus=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w )
16 | cpus=$((cpus / 2))
17 | echo "Using $cpus CPU cores"
18 | 
19 | mkdir -p hdf5/
20 | find -L results4/ -name "part*" | xargs --max-args=1 --max-procs=$cpus ./create_pretraining_data_wrapper.sh
21 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/cleanup_scripts/process_wiki.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 MLBenchmark Group. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 | 
17 | # invocation script to cleanup the wiki dataset
18 | # Usage: ./process_wiki.sh <input files>
19 | # example: ./process_wiki.sh 'sample_data/wiki_??'
20 | # The resulted files will be placed in ./results 21 | 22 | inputs=$1 23 | 24 | pip install nltk 25 | 26 | # Remove doc tag and title 27 | python ./cleanup_file.py --data=$inputs --output_suffix='.1' 28 | 29 | # Further clean up files 30 | for f in ${inputs}; do 31 | ./clean.sh ${f}.1 ${f}.2 32 | done 33 | 34 | # Sentence segmentation 35 | python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3' 36 | 37 | mkdir -p ./results 38 | 39 | ## Choose file size method or number of packages by uncommenting only one of the following do_gather options 40 | # Gather into fixed size packages 41 | python ./do_gather.py --data=$inputs --input_suffix='.3' --block_size=26.92 --out_dir='./results' 42 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/cleanup_scripts/transparency_in_test_set_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import glob 15 | 16 | output_filename = 'wiki_test_set.txt' 17 | 18 | test_articles = [] 19 | 20 | file_glob = glob.glob('./results/part*', recursive=False) 21 | 22 | with open(output_filename, mode='w', newline='\n') as ofile: 23 | for filename in file_glob: 24 | articles_in_file = [] 25 | with open(filename, mode='r', newline='\n') as ifile: 26 | lines = ifile.read() 27 | articles_in_file_tmp = lines.split('\n\n') 28 | articles_in_file = [] 29 | for item in articles_in_file_tmp: 30 | if item.rstrip() != '': 31 | articles_in_file.append(item) 32 | 33 | target_article = min(42, len(articles_in_file) // 2) 34 | test_articles.append(articles_in_file[target_article]) 35 | 36 | with open(filename, mode='w', newline='\n') as ifile: 37 | for article in articles_in_file[:target_article]: 38 | ifile.write(article) 39 | ifile.write('\n\n') 40 | 41 | for article in articles_in_file[target_article+1:]: 42 | ifile.write(article) 43 | ifile.write('\n\n') 44 | 45 | for article in test_articles: 46 | ofile.write(article) 47 | ofile.write('\n\n') 48 | 49 | print("n_articles =", len(test_articles)) 50 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_A30_1x2x224x14.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=224 3 | export GRADIENT_STEPS=14 4 | export LR=3.7e-4 5 | export MAX_SAMPLES_TERMINATION=20000000 6 | export MAX_STEPS=7100 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_bias_mha --fused_dropout_add " 13 | export PHASE=2 14 | export EVAL_ITER_START_SAMPLES=150000 15 | export EVAL_ITER_SAMPLES=150000 16 | 17 | ## System run parms 18 | export 
DGXNNODES=1 19 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 20 | export WALLTIME=04:00:00 21 | 22 | ## System config params 23 | export DGXNGPU=2 24 | export DGXSOCKETCORES=64 25 | export DGXNSOCKET=2 26 | export DGXHT=2 # HT is on is 2, HT off is 1 27 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_A40_1x2x224x14.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=224 3 | export GRADIENT_STEPS=14 4 | export LR=3.7e-4 5 | export MAX_SAMPLES_TERMINATION=20000000 6 | export MAX_STEPS=7100 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | #export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_bias_mha --fused_dropout_add " 13 | export EXTRA_PARAMS="--dense_seq_output --unpad --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_dropout_add " 14 | export PHASE=2 15 | export EVAL_ITER_START_SAMPLES=150000 16 | export EVAL_ITER_SAMPLES=150000 17 | 18 | ## System run parms 19 | export DGXNNODES=1 20 | #export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 21 | export DGXSYSTEM="A40_1x2x224x14" 22 | export WALLTIME=04:00:00 23 | 24 | ## System config params 25 | export DGXNGPU=2 26 | export DGXSOCKETCORES=8 27 | export DGXNSOCKET=1 28 | export DGXHT=2 # HT is on is 2, HT off is 1 29 | 30 | export CONT=swsok/mlperf-nvidia:language_model 31 | export DATADIR="/home/swosok/mlperf/bert/hdf5/training-4320/hdf5_4320_shards_varlength" 32 | export DATADIR_PHASE2="/home/swsok/mlperf/bert/hdf5/training-4320/hdf5_4320_shards_varlength" 33 | export EVALDIR="/home/swsok/mlperf/bert/hdf5/eval_varlength" 34 | export CHECKPOINTDIR_PHASE1="/home/swsok/mlperf/bert/phase1" 35 | export CHECKPOINTDIR="/home/swsok/mlperf/bert/checkpoints" 36 | export CUDA_VISIBLE_DEVICES="0,1" 37 | export NEXP=1 38 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_1x4x56x2.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=112 3 | export GRADIENT_STEPS=2 4 | export LR=3.5e-4 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=8041 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=4 " 13 | export PHASE=2 14 | export EVAL_ITER_START_SAMPLES=150000 15 | export EVAL_ITER_SAMPLES=150000 16 | 17 | ## System run parms 18 | export DGXNNODES=1 19 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 20 | export WALLTIME=01:15:00 21 | 22 | ## System config params 23 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_4gpu_common.sh 24 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_1x8x56x1.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=56 3 | export GRADIENT_STEPS=1 4 | export LR=0.000425 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=6700 7 | export OPT_LAMB_BETA_1=0.9 8 | export 
OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | export WEIGHT_DECAY_RATE=0.01 12 | export INIT_LOSS_SCALE=1024.0 13 | 14 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_gemm_gelu " 15 | export PHASE=2 16 | export EVAL_ITER_START_SAMPLES=150000 17 | export EVAL_ITER_SAMPLES=150000 18 | 19 | ## System run parms 20 | export DGXNNODES=1 21 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 22 | export WALLTIME_MINUTES=23 23 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 24 | 25 | ## System config params 26 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 27 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_4gpu_common.sh: -------------------------------------------------------------------------------- 1 | ## System config params 2 | export DGXNGPU=4 3 | export DGXSOCKETCORES=64 4 | export DGXNSOCKET=2 5 | export DGXHT=2 # HT is on is 2, HT off is 1 6 | export SLURM_NTASKS=${DGXNGPU} 7 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 8 | 9 | ## Data Paths 10 | export DATADIR="/raid/datasets/bert/hdf5/4320_shards" 11 | export EVALDIR="/raid/datasets/bert/hdf5/eval_4320_shard" 12 | export DATADIR_PHASE2="/raid/datasets/bert/hdf5/4320_shards" 13 | export CHECKPOINTDIR="$CI_BUILDS_DIR/$SLURM_ACCOUNT/$CI_JOB_ID/ci_checkpoints" 14 | export RESULTSDIR="$CI_BUILDS_DIR/$SLURM_ACCOUNT/$CI_JOB_ID/results" 15 | #using existing checkpoint_phase1 dir 16 | export CHECKPOINTDIR_PHASE1="/raid/datasets/bert/checkpoints/checkpoint_phase1" 17 | export UNITTESTDIR="/lustre/fsw/mlperf/mlperft-bert/unit_test" 18 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_512x8x2x1_pack.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=2 3 | export GRADIENT_STEPS=1 4 | export PACKING_FACTOR=2 5 | export INIT_LOSS_SCALE=128.0 6 | export LR=0.0033 7 | export MAX_SAMPLES_TERMINATION=12000000 8 | export MAX_STEPS=470 9 | export OPT_LAMB_BETA_1=0.75 10 | export OPT_LAMB_BETA_2=0.9 11 | export START_WARMUP_STEP=-100 12 | export WEIGHT_DECAY_RATE=0.0166629 13 | export WARMUP_STEPS=290 14 | export SBATCH_NETWORK=sharp 15 | export NCCL_GRAPH_REGISTER=1 16 | export EXTRA_PARAMS="--use_cuda_graph --pad_fmha --cuda_graph_mode 'full_iteration' --max_iterations_per_graph 1 --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_bias_fc_loss_head --packed_samples " 17 | export PHASE=2 18 | 19 | ## System run parms 20 | export DGXNNODES=512 21 | 22 | # hparams that depend on number of nodes 23 | export EVAL_ITER_START_SAMPLES=325000 #$(echo "25000*(0.05*(230.23*${BATCHSIZE}*${DGXNNODES}*8*${PACKING_FACTOR}+3000000)/25000)" | bc) 24 | export EVAL_ITER_SAMPLES=${EVAL_ITER_START_SAMPLES} 25 | 26 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 27 | export WALLTIME_MINUTES=7 28 | 29 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 30 | 31 | ## System config params 32 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 33 | export DATADIR_PHASE2="/raid/datasets/bert/hdf5/4320_packed_shards" 34 | 35 | export CONTAINER_PRELOAD_LUSTRE=1 36 | export USE_DDP=1 37 | -------------------------------------------------------------------------------- 
/mlperf/pytorch-22.09/config_DGXA100_8x8x48x1.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=48 3 | export GRADIENT_STEPS=1 4 | export LR=0.0020992 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=1059 7 | export OPT_LAMB_BETA_1=0.60466 8 | export OPT_LAMB_BETA_2=0.85437 9 | export START_WARMUP_STEP=0 10 | export WARMUP_STEPS=0 11 | export WEIGHT_DECAY_RATE=0.1 12 | export INIT_LOSS_SCALE=4096.0 13 | 14 | export SBATCH_NETWORK=sharp 15 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_gemm_gelu " 16 | export PHASE=2 17 | export EVAL_ITER_START_SAMPLES=175000 18 | export EVAL_ITER_SAMPLES=175000 19 | 20 | ## System run parms 21 | export DGXNNODES=8 22 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 23 | export WALLTIME_MINUTES=15 24 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 25 | 26 | ## System config params 27 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 28 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_common.sh: -------------------------------------------------------------------------------- 1 | ## System config params 2 | export DGXNGPU=8 3 | export DGXSOCKETCORES=64 4 | export DGXNSOCKET=2 5 | export DGXHT=2 # HT is on is 2, HT off is 1 6 | export SLURM_NTASKS=${DGXNGPU} 7 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/input_preprocessing/create_pretraining_data_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 16 | 17 | INPUT=${1} 18 | OUTPUT=${2}/$(basename $INPUT) 19 | VOCAB=${3} 20 | 21 | python3 ${SCRIPT_DIR}/create_pretraining_data.py \ 22 | --input_file=${INPUT} \ 23 | --output_file=${OUTPUT} \ 24 | --vocab_file=${VOCAB} \ 25 | --do_lower_case=True \ 26 | --max_seq_length=512 \ 27 | --max_predictions_per_seq=76 \ 28 | --masked_lm_prob=0.15 \ 29 | --random_seed=12345 \ 30 | --dupe_factor=10 31 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/input_preprocessing/do_sentence_segmentation.py: -------------------------------------------------------------------------------- 1 | """Script for sentence segmentation. 
2 | 
3 | Copied and modified from https://github.com/eric-haibin-lin/text-proc.git
4 | """
5 | import argparse
6 | import glob
7 | import io
8 | import logging
9 | import multiprocessing
10 | import os
11 | import time
12 | import nltk
13 | 
14 | from nltk.tokenize import sent_tokenize
15 | 
16 | parser = argparse.ArgumentParser(
17 |     description='Sentence segmentation for BERT documents.')
18 | parser.add_argument(
19 |     '--data',
20 |     type=str,
21 |     default='./*/*.compact',
22 |     help='Input files. Default is "./*/*.compact"')
23 | parser.add_argument(
24 |     '--input_suffix',
25 |     type=str,
26 |     default='.2',
27 |     help='Suffix for input files. Default is ".2"')
28 | parser.add_argument(
29 |     '--output_suffix',
30 |     type=str,
31 |     default='.3',
32 |     help='Suffix for output files. Default is ".3"')
33 | parser.add_argument(
34 |     '--nworker',
35 |     type=int,
36 |     default=72,
37 |     help='Number of workers for parallel processing.')
38 | args = parser.parse_args()
39 | 
40 | # download package
41 | nltk.download('punkt')
42 | 
43 | # arguments
44 | input_files = sorted(glob.glob(os.path.expanduser(args.data)))
45 | num_files = len(input_files)
46 | num_workers = args.nworker
47 | logging.basicConfig(level=logging.INFO)
48 | logging.info('Number of input files to process = %d', num_files)
49 | 
50 | 
51 | def process_one_file(one_input):
52 |     """Separate paragraphs into sentences, for one file."""
53 |     input_filename = one_input + args.input_suffix
54 |     output_filename = one_input + args.output_suffix
55 |     logging.info('Processing %s => %s', input_filename, output_filename)
56 |     with io.open(input_filename, 'r', encoding='utf-8') as fin:
57 |         with io.open(output_filename, 'w', encoding='utf-8') as fout:
58 |             for line in fin:
59 |                 if len(line) == 1:
60 |                     fout.write(u'\n')
61 |                 sents = sent_tokenize(line)
62 |                 for sent in sents:
63 |                     sent_str = sent.strip()
64 |                     # if sent_str:
65 |                     fout.write('%s\n' % sent_str)
66 |                 fout.write(u'\n')
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     tic = time.time()
71 |     p = multiprocessing.Pool(num_workers)
72 |     p.map(process_one_file, input_files)
73 |     toc = time.time()
74 |     logging.info('Processed %s in %.2f sec', args.data, toc - tic)
75 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/eval_varlength.chk:
--------------------------------------------------------------------------------
1 | part_eval_10k.hdf5 611d8bae26646145e1c33338a27ba124
2 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/hdf5_md5.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import h5py
3 | import numpy as np
4 | import hashlib
5 | import os
6 | 
7 | # Example usage:
8 | # python3 hdf5_md5.py --input_hdf5=part_eval_10k.hdf5
9 | 
10 | parser = argparse.ArgumentParser(
11 |     description="HDF5 variable length to MD5sums for BERT.")
12 | parser.add_argument(
13 |     '--input_hdf5',
14 |     type=str,
15 |     required=True,
16 |     help='Input hdf5 path')
17 | args = parser.parse_args()
18 | 
19 | 
20 | if __name__ == '__main__':
21 | 
22 |     h = hashlib.md5
23 | 
24 |     row_sums = []
25 |     f = h5py.File(args.input_hdf5, 'r')
26 |     for i in range(f['input_ids'].shape[0]):
27 |         row_sums.append(h(str(f['input_ids'][i].tolist()).encode('utf-8')).hexdigest())
28 |     f.close()
29 |     print("{}\t{}".format(os.path.basename(args.input_hdf5), h(str(row_sums).encode('utf-8')).hexdigest()))
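hdf5_md5.py pairs with eval_varlength.chk above: it prints the shard basename and a digest, which should match the recorded line. A usage sketch (the shard's directory is an assumption):

python3 hdf5_md5.py --input_hdf5=eval_varlength/part_eval_10k.hdf5
# expected output:
# part_eval_10k.hdf5    611d8bae26646145e1c33338a27ba124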
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/packed_data/README.md:
--------------------------------------------------------------------------------
1 | # Download and prepare the data
2 | 
3 | Building the Docker container:
4 | ```shell
5 | docker build --pull -t <docker/registry>/mlperf-nvidia:language_model .
6 | docker push <docker/registry>/mlperf-nvidia:language_model
7 | ```
8 | 
9 | Go through the standard data preparation up to the moment where you have the unpacked "results4" sources.
10 | 
11 | Assuming /data/mlperf/bert/ contains the 'results4' and 'phase1' directories.
12 | 
13 | Start the container interactively, mounting the directory where you want to store the experiment data as `/workspace/bert_data`:
14 | ```
15 | docker run -it --runtime=nvidia --ipc=host (...) -v /data/mlperf/bert:/workspace/bert_data mlperf-nvidia:language_model
16 | ```
17 | 
18 | To prepare the packed version of the data, we first need to group the training sequences by length (the number of valid tokens). To parallelize the process easily, each shard is processed separately and the results are merged at the end.
19 | ```
20 | mkdir -p /workspace/bert_data/per_seqlen_parts
21 | for shard in `seq -w 00000 00499`; do
22 |     mkdir -p /workspace/bert_data/per_seqlen_parts/part-${shard}
23 | done
24 | ```
25 | 
26 | Parallelize over $CPUS cores:
27 | ```
28 | CPUS=64
29 | seq -w 00000 00499 | xargs --max-args=1 --max-procs=$CPUS -I{} python create_per_seqlength_data.py --input_file ../download/results4/part-{}-of-00500 --output_file ./per_seqlen/part_{} --vocab_file ../phase1/vocab.txt --do_lower_case=True --max_seq_length=512 --max_predictions_per_seq=76 --masked_lm_prob=0.15 --random_seed=12345 --dupe_factor=10
30 | ```
31 | 
32 | Merge all results:
33 | ```
34 | mkdir -p /workspace/bert_data/per_seqlen
35 | seq 0 511 | xargs --max-args=1 --max-procs=$CPUS -I{} python ./gather_per_seqlength_data.py --input_hdf5 /workspace/bert_data/per_seqlen_parts --output_hdf5 /workspace/bert_data/per_seqlen --seq_length {}
36 | ```
37 | 
38 | Generate a sub-optimal packing strategy based on the length distribution of the training set and store per-shard lists of sample assignments (a toy illustration of the packing idea follows this file):
39 | ```
40 | python ./generate_packing_strategy.py --input_hdf5 /workspace/bert_data/per_seqlen --output_hdf5 /workspace/bert_data/packed_data --max_seq_length 512 --max_seq_per_sample 3 --shards_num 4320
41 | ```
42 | 
43 | Create training set shards based on the generated lists:
44 | ```
45 | python create_packed_trainset.py --input_hdf5 /workspace/bert_data/per_seqlen --assignment_file /workspace/bert_data/packed_data --output_hdf5 /workspace/bert_data/packed_data
46 | ```
47 | 
--------------------------------------------------------------------------------
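The packing step above fits several short sequences into one 512-token training sample. A toy, illustrative first-fit sketch of the idea — not the repository's actual generate_packing_strategy.py algorithm:

def pack(lengths, max_seq_length=512, max_seq_per_sample=3):
    """Greedy first-fit-decreasing packing of sequence lengths into samples."""
    bins = []  # each bin: [remaining token capacity, [sequence indices]]
    for idx in sorted(range(len(lengths)), key=lambda i: -lengths[i]):
        for b in bins:
            if lengths[idx] <= b[0] and len(b[1]) < max_seq_per_sample:
                b[0] -= lengths[idx]
                b[1].append(idx)
                break
        else:  # no existing bin fits: open a new one
            bins.append([max_seq_length - lengths[idx], [idx]])
    return [b[1] for b in bins]

print(pack([500, 120, 380, 90, 200]))  # -> [[0], [2, 1], [4, 3]]

--------------------------------------------------------------------------------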
/mlperf/pytorch-22.09/input_preprocessing/process_wiki.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # invocation script to cleanup the wiki dataset
4 | # Usage: ./process_wiki.sh <input files>
5 | # example: ./process_wiki.sh 'sample_data/wiki_??'
6 | # The resulting files will be placed in ./results
7 | 
8 | inputs=$1
9 | 
10 | pip install nltk
11 | 
12 | # Remove doc tag and title
13 | # python ./cleanup_file.py --data=$inputs --output_suffix='.1'
14 | 
15 | # Further clean up files
16 | # for f in ${inputs}; do
17 | #     ./clean.sh ${f}.1 ${f}.2
18 | # done
19 | 
20 | # Sentence segmentation
21 | # python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'
22 | 
23 | mkdir -p ./results
24 | 
25 | # Train/Eval separation
26 | python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
27 | 
28 | ## Choose file size method or number of packages by uncommenting only one of the following do_gather options
29 | # Gather into fixed size packages
30 | python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
31 | 
32 | # Gather into fixed number of packages
33 | #NUM_PACKAGES=512
34 | #python ./do_gather.py --data=$inputs --input_suffix='.3' --num_outputs=$NUM_PACKAGES --out_dir='./results'
35 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/shuffle_samples.py:
--------------------------------------------------------------------------------
1 | from typing import OrderedDict
2 | import h5py
3 | import numpy as np
4 | import argparse
5 | import logging
6 | from tqdm import tqdm
7 | from itertools import repeat, cycle
8 | import json
9 | import glob
10 | import random
11 | 
12 | logging.basicConfig(level=logging.INFO)
13 | parser = argparse.ArgumentParser(
14 |     description="Training data sharding for BERT.")
15 | parser.add_argument(
16 |     '--input_hdf5',
17 |     type=str,
18 |     default='hdf5',
19 |     help='Input hdf5_file path')
20 | parser.add_argument(
21 |     '--output_hdf5',
22 |     type=str,
23 |     default='',
24 |     help='Output hdf5_dir path')
25 | args = parser.parse_args()
26 | 
27 | 
28 | input_files = sorted(glob.glob(args.input_hdf5 + '/part_*.hdf5', recursive=False))
29 | num_shards = len(input_files)
30 | logging.info('n_input_shards = {}'.format(num_shards))
31 | 
32 | ifile_handles = {}
33 | for ifile_idx in tqdm(range(num_shards)):
34 |     handle = h5py.File(f'{input_files[ifile_idx]}', 'r')
35 |     print(handle.keys())
36 |     ifile_handles[ifile_idx] = [
37 |         handle['input_ids'][:],
38 |         handle['input_mask'][:],
39 |         handle['segment_ids'][:],
40 |         handle['masked_lm_positions'][:],
41 |         handle['masked_lm_ids'][:],
42 |         handle['next_sentence_labels'][:]
43 |     ]
44 |     handle.close()
45 | 
46 | ind = [(i, j) for idx in range(num_shards) for i, j in zip(cycle([idx]), list(range(ifile_handles[idx][0].shape[0])))]
47 | random.shuffle(ind)
48 | 
49 | # dump per-shard sample index lists
50 | master_sample_idx = 0
51 | for ofile_idx in tqdm(range(num_shards)):
52 |     n_samples_in_this_shard = ifile_handles[ofile_idx][0].shape[0]
53 |     idxs = ind[master_sample_idx:master_sample_idx+n_samples_in_this_shard]
54 |     with open(f'{args.output_hdf5}/shard_list_{ofile_idx:05}.lst', 'w') as f:
55 |         f.write(json.dumps(idxs))
56 |     # swsok: this line was omitted in the original script (without it, every
57 |     # shard would receive the same slice of the shuffled index list)
58 |     master_sample_idx = master_sample_idx + n_samples_in_this_shard
59 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/mhalib/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved.
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import setuptools 17 | from setuptools import setup 18 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 19 | 20 | setup( 21 | name='mhalib', 22 | ext_modules=[ 23 | CUDAExtension( 24 | name='mhalib', 25 | sources=['mha_funcs.cu'], 26 | extra_compile_args={ 27 | 'cxx': ['-O3',], 28 | 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] 29 | } 30 | ) 31 | ], 32 | cmdclass={ 33 | 'build_ext': BuildExtension 34 | }) 35 | 36 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/mlperf_logger.py: -------------------------------------------------------------------------------- 1 | # import collections 2 | # import os 3 | # import subprocess 4 | 5 | import torch 6 | from mlperf_common.logging import MLLoggerWrapper 7 | from mlperf_common.frameworks.pyt import PyTCommunicationHandler 8 | 9 | mllogger = MLLoggerWrapper(PyTCommunicationHandler()) -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .activations import bias_gelu_impl 4 | 5 | __all__ = ["bias_gelu_impl"] 6 | 7 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/layers/attention.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/fused.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import mlp_cuda 3 | from torch import nn 4 | from apex import amp 5 | 6 | #implements fused GEMM+bias in forward pass using mlp_cuda from apex 7 | class FusedMlpFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input, weight, bias): 10 | ctx.save_for_backward(input, weight) 11 | output = mlp_cuda.forward(True, 0, (input, weight, bias)) 12 | return output[0] 13 | 14 | @staticmethod 15 | def backward(ctx, grad_output): 16 | input, weight = ctx.saved_tensors 17 | grad_input = grad_output.mm(weight) 18 | grad_weight = grad_output.t().mm(input) 19 | 
grad_bias = torch.sum(grad_output, dim=0) 20 | return grad_input, grad_weight, grad_bias 21 | 22 | mlp_function = amp.half_function(FusedMlpFunction.apply) 23 | 24 | class FusedMlp(nn.Module): 25 | def __init__(self, in_features, out_features, bias=True): 26 | super(FusedMlp, self).__init__() 27 | self.in_features = in_features 28 | self.out_features = out_features 29 | self.weight = nn.Parameter(torch.Tensor(out_features, in_features)) 30 | if bias: 31 | self.bias = nn.Parameter(torch.Tensor(out_features)) 32 | else: 33 | self.register_parameter('bias', None) 34 | 35 | def forward(self, input): 36 | return mlp_function(input, self.weight, self.bias) 37 | 38 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import torch 19 | from torch import nn 20 | 21 | try: 22 | import apex 23 | #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm') 24 | import apex.normalization 25 | #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') 26 | from apex.contrib.layer_norm import FastLayerNorm 27 | BertLayerNorm = FastLayerNorm 28 | except ImportError: 29 | print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") 30 | class BertLayerNorm(nn.Module): 31 | def __init__(self, hidden_size, eps=1e-12): 32 | """Construct a layernorm module in the TF style (epsilon inside the square root). 
33 | """ 34 | super(BertLayerNorm, self).__init__() 35 | self.weight = nn.Parameter(torch.ones(hidden_size)) 36 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 37 | self.variance_epsilon = eps 38 | 39 | def forward(self, x): 40 | u = x.mean(-1, keepdim=True) 41 | s = (x - u).pow(2).mean(-1, keepdim=True) 42 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 43 | return self.weight * x + self.bias 44 | 45 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/losses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/losses/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/models/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/mounts.txt: -------------------------------------------------------------------------------- 1 | ${DATADIR}:/workspace/data 2 | ${DATADIR_PHASE2}:/workspace/data_phase2 3 | ${CHECKPOINTDIR_PHASE1}:/workspace/phase1 4 | ${EVALDIR}:/workspace/evaldata 5 | #${UNITTESTDIR}:/workspace/unit_test_data 6 | # 7 | ${PWD}/run_pretraining.py:/workspace/bert/run_pretraining.py 8 | ${PWD}/run_and_time.sh:/workspace/bert/run_and_time.sh 9 | ${CHECKPOINTDIR}:/workspace/checkpoints 10 | #${PWD}/bert_config_no-dp.json:/workspace/phase1/bert_config.json 11 | #${PWD}/modeling.py:/workspace/bert/modeling.py 12 | #${PWD}/fwd_loss_bwd_trainer.py:/workspace/bert/fwd_loss_bwd_trainer.py 13 | #${PWD}/fmha.py:/workspace/bert/fmha.py 14 | #${PWD}/distributed_fused_lamb.py:/opt/conda/lib/python3.8/site-packages/apex/contrib/optimizers/distributed_fused_lamb.py 15 | #${PWD}/fmhalib.cpython-38-x86_64-linux-gnu.so-cond:/opt/conda/lib/python3.8/site-packages/fmhalib.cpython-38-x86_64-linux-gnu.so 16 | 17 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | boto3==1.14.0 3 | gdown==4.4.0 4 | git+https://github.com/mlcommons/logging.git@2.1.0-rc1 5 | h5py==2.10.0 6 | html2text==2020.1.16 7 | ipdb==0.13.2 8 | nltk==3.5 9 | onnxruntime==1.3.0 10 | parameterized 11 | progressbar==2.5 12 | requests==2.23.0 13 | six==1.15.0 14 | tensorflow==2.2.0 15 | git+https://github.com/NVIDIA/mlperf-common.git@training-v2.1-rc0 16 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m torch.distributed.launch --nproc_per_node=8 \ 4 | -u /workspace/bert/run_pretraining.py \ 5 | --seed=42 \ 6 | --do_train \ 7 | --target_accuracy=0.714 \ 8 | --accuracy_score_averaging=1 \ 9 | --config_file=/workspace/phase1/bert_config.json \ 10 | --skip_checkpoint \ 11 | --output_dir=/results \ 12 | --fp16 \ 13 | --allreduce_post_accumulation --allreduce_post_accumulation_fp16 \ 14 | --gradient_accumulation_steps=1 \ 15 | --log_freq=1 \ 16 | --train_batch_size=4 \ 17 | 
--learning_rate=4e-5 \ 18 | --warmup_proportion=1.0 \ 19 | --input_dir=/workspace/data_phase2 \ 20 | --phase2 \ 21 | --max_seq_length=512 \ 22 | --max_predictions_per_seq=76 \ 23 | --max_steps=100 \ 24 | --init_checkpoint=/workspace/phase1/model.ckpt-28252 \ 25 | 26 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/unit_test/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/global_vars.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | # The purpose of this module is to provide a space to allow global variables to work properly inside of the unittest framework 4 | 5 | # An example case is loading the TF weights file. 6 | ## It is ~4GB and read-only, so why not just load it once into a global variable 7 | 8 | tf_weights = None 9 | tf_tensors = None 10 | 11 | pyt_model = None 12 | pyt_checkpoint = None 13 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/test_data_path.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import os 4 | 5 | expected_data_path = '/workspace/unit_test_data' 6 | 7 | def data_path_found(): 8 | return os.path.isdir(expected_data_path) 9 | 10 | def get_path(): 11 | if data_path_found(): 12 | return expected_data_path 13 | else: 14 | raise ValueError('Unit test data not found - missing or not mounted correctly.') 15 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/test_main.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import unittest 4 | 5 | from test_bert_batch_1 import * 6 | #from test_bert_batch_7 import * 7 | from test_embeddings_batch_1 import * 8 | from test_encoders_batch_1 import * 9 | 10 | if __name__ == '__main__': 11 | unittest.main(verbosity=2) 12 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/unit_test_utils.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import numpy as np 4 | 5 | # Fractions of max absolute difference 6 | bins_relative = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1] 7 | 8 | def max_abs_diff_binning(input_a, input_b): 9 | abs_diff = np.abs(input_a - input_b) 10 | max_abs_diff = np.max(abs_diff) 11 | max_idx = np.argmax(abs_diff) 12 | 13 | counts, bins_absolute = np.histogram(abs_diff, np.array(bins_relative) * max_abs_diff) 14 | return counts, bins_absolute, bins_relative, max_idx 15 | 16 | def pyt_tf_mapping(pyt_state_dict, add_prefix=''): 17 | pyt_strings = [x for x in pyt_state_dict.keys()] 18 | converted_strings = [add_prefix + x.replace('.', '/').replace('weight', 'kernel') for x in pyt_strings] 19 | 20 | for idx, item in enumerate(converted_strings): 21 | if 'LayerNorm' in item: 22 | item = item.replace('kernel', 'gamma').replace('bias', 'beta') 23 | elif 'embedding' in item: 24 | item = item.replace('/kernel', '') 25 | 26 | if 'layer/' in item: 27 | item = item.replace('layer/', 'layer_') 28 | 29 | if 'cls/' in item and 'decoder' not in item and 'dense' not in item: 30 | item = 
item.replace('bias', 'output_bias')
31 |             item = item.replace('kernel', 'output_weights')
32 | 
33 |         if 'decoder' in item:
34 |             item = item.replace('predictions/decoder', 'predictions/transform/dense')
35 |             #item = item.replace('predictions/decoder', 'seq_relationship')
36 |         # Add additional rules here
37 | 
38 |         converted_strings[idx] = item
39 | 
40 |     return dict(zip(pyt_state_dict.keys(), converted_strings))
41 | 
--------------------------------------------------------------------------------
/opt_prime/opt_prime/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/opt_prime/opt_prime/__init__.py
--------------------------------------------------------------------------------
/torchgpipe_OOO_PP/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2022-present, ETRI, All rights reserved.
4 | 
5 | From PyTorch:
6 | Copyright (c) 2014- Facebook, Inc, All rights reserved.
7 | 
8 | From torchgpipe:
9 | Copyright (c) 2019-2020, Kakao Brain, All rights reserved.
10 | 
11 | 
12 | Redistribution and use in source and binary forms, with or without
13 | modification, are permitted provided that the following conditions are met:
14 | 
15 | 1. Redistributions of source code must retain the above copyright
16 |    notice, this list of conditions and the following disclaimer.
17 | 
18 | 2. Redistributions in binary form must reproduce the above copyright
19 |    notice, this list of conditions and the following disclaimer in the
20 |    documentation and/or other materials provided with the distribution.
21 | 
22 | 3. Neither the name of the copyright holder nor the names of its
23 |    contributors may be used to endorse or promote products derived from this
24 |    software without specific prior written permission.
25 | 
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 | POSSIBILITY OF SUCH DAMAGE.
37 | 
--------------------------------------------------------------------------------
/torchgpipe_OOO_PP/README.md:
--------------------------------------------------------------------------------
1 | ## Pipeline parallelization based on out-of-order technology
2 | 
3 | We present the results of developing a PoC that applies out-of-order technology (https://dl.acm.org/doi/pdf/10.1145/3492321.3519563) to pipeline parallelization.
4 | 
5 | ## Usage
6 | 
7 | This SW requires:
8 | * Python 3
9 | * PyTorch 1.12+
10 | * torchgpipe 0.0.7
11 | 
12 | ## RUN
13 | 
14 | python3 source.py
15 | 
16 | ## License
17 | 
18 | The results of the AIcomp project are distributed under the 3-clause BSD license.
19 | 
--------------------------------------------------------------------------------
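source.py itself is not included in this listing; for orientation, a minimal stock torchgpipe 0.0.7 pipeline — the baseline whose schedule the PoC's out-of-order technique modifies — looks roughly like this (assumes at least two CUDA devices):

import torch
from torch import nn
from torchgpipe import GPipe

# Toy 4-module model, split into two partitions of two modules each,
# with each mini-batch divided into 8 micro-batches.
model = nn.Sequential(
    nn.Linear(512, 512), nn.ReLU(),
    nn.Linear(512, 512), nn.ReLU(),
)
model = GPipe(model, balance=[2, 2], chunks=8)

x = torch.rand(64, 512).to(model.devices[0])  # input lives on the first partition's device
out = model(x)           # pipelined, micro-batched forward
out.mean().backward()    # pipelined backward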