├── LICENSE ├── README.md ├── compiler_fx ├── LICENSE ├── README.md ├── deep_gpt2-xl.py ├── deep_llama-8b.py ├── deepspeed_gpt2.py ├── deepspeed_pp_training.py ├── example_train_gpu.py ├── fx_dist_inference_type-A.py ├── fx_dist_inference_type-B.py ├── fx_dist_pp_dp_training_type-A_gpt2_gpu.py ├── fx_dist_pp_dp_training_type-A_gpu.py ├── fx_dist_pp_dp_training_type-A_gpu_validate.py ├── fx_dist_pp_training_type-A.py ├── fx_dist_pp_training_type-A_bert.py ├── fx_dist_pp_training_type-A_gpt2-medium.py ├── fx_dist_pp_training_type-A_gpt2.py ├── fx_dist_pp_training_type-A_gpt2_gpu.py ├── fx_dist_pp_training_type-A_gpt2_gpu_mopt1.py ├── fx_dist_pp_training_type-A_gpu.py ├── fx_dist_pp_training_type-A_gpu_measure-flops.py ├── fx_dist_pp_training_type-A_gpu_mopt1.py ├── fx_dist_pp_training_type-A_mopt1.py ├── fx_dist_pp_training_type-A_remove-fwdcache.py ├── fx_dist_pp_training_type-A_transformer.py ├── fx_dist_pp_training_type-B.py ├── fx_dist_pp_training_type-B_transformer.py ├── fx_dist_pp_training_type-C_bert_gpu.py ├── fx_dist_pp_training_type-C_gpt-neo_gpu.py ├── fx_dist_pp_training_type-C_gpt2-large_gpu.py ├── fx_dist_pp_training_type-C_gpt2-medium_gpu.py ├── fx_dist_pp_training_type-C_gpt2-xl_gpu.py ├── fx_dist_pp_training_type-C_gpt2_gpu.py ├── fx_dist_pp_training_type-C_gptj_gpu.py ├── fx_dist_pp_training_type-C_gpu.py ├── fx_dist_training_type-B.py ├── fx_dist_training_type-B_many-process.py ├── fx_inference.py ├── fx_inference_restructured.py ├── fx_ir_transfer.py ├── fx_split_graph.py ├── fx_split_range_traversal.py ├── fx_train.py ├── fx_train_extended.py ├── fx_train_with_backward_IR.py ├── fx_train_with_forwardonly_IR.py ├── fx_transformer.py ├── llama_2d.py ├── memory_usage.py ├── memory_usage2.py ├── memory_usage3.py ├── name_map_test.py ├── pippy-2_pp_training.py ├── pippy_pp_dp_training.py ├── pippy_pp_dp_training_gpt2.py ├── pippy_pp_training.py ├── pippy_pp_training_gpt2-large.py ├── pippy_pp_training_gpt2-medium.py ├── pippy_pp_training_gpt2-xl.py ├── pippy_pp_training_gpt2.py ├── py_comp.py ├── tf_comp.py ├── util_find_linear.py ├── util_setup_mesh.py ├── varuna_pp_dp_training.py ├── varuna_pp_training.py └── vit_train.py ├── gpt-neox ├── .clang-format ├── .dockerignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README-MUP.md ├── README.md ├── configs │ ├── 1.3B-32k-len-conf.yml │ ├── 125M-32k-len-conf.yml │ ├── 125M.yml │ ├── 2-7B.yml │ ├── 2.7B-32k-len-conf.yml │ ├── 20B.yml │ ├── 250M-32k-len-conf.yml │ ├── 6-7B.yml │ ├── 6.7B-32k-len-conf.yml │ ├── 6.7B-32k-len-conf.yml_org │ ├── 760M-32k-len-conf.yml │ ├── README.md │ ├── autotuning_configs │ │ ├── small_tune.json │ │ ├── tune.json │ │ ├── tune_1-3B.json │ │ └── tune_6-7B.json │ ├── enwik8.yml │ ├── etri_cluster.yml │ ├── finetuning_configs │ │ └── 6-9B.yml │ ├── gen_docs.py │ ├── llama │ │ ├── 13B.yml │ │ ├── 30B.yml │ │ ├── 65B.yml │ │ ├── 7B.yml │ │ ├── README.md │ │ └── train_config.yml │ ├── neox_arguments.md │ ├── org │ │ ├── 1-3B.yml │ │ ├── 125M-json.yml │ │ ├── 13B.yml │ │ ├── 175B.yml │ │ ├── 19M.yml │ │ ├── 250M.yml │ │ ├── 350M.yml │ │ ├── 49M.yml │ │ ├── 760M.yml │ │ ├── 800M.yml │ │ ├── bf16_125M.yml │ │ ├── bnb_125M.yml │ │ ├── cpu_mock_config.yml │ │ ├── eleutherai_cluster.yml │ │ ├── gmlp_small.yml │ │ ├── local_setup.yml │ │ ├── slurm_125M.yml │ │ ├── slurm_local.yml │ │ ├── sparse.yml │ │ ├── test.yml │ │ └── text_generation.yml │ ├── pile.yml │ ├── pythia │ │ ├── 1-4B.yml │ │ ├── 12B.yml │ │ ├── 160M.yml │ │ ├── 1B.yml │ │ ├── 2-8B.yml │ │ ├── 
410M.yml │ │ ├── 6-9B.yml │ │ └── 70M.yml │ └── slurm_local.json ├── deepy.py ├── eval_tasks │ ├── __init__.py │ └── eval_adapter.py ├── evaluate.py ├── generate.py ├── hostfile ├── megatron │ ├── __init__.py │ ├── checkpointing.py │ ├── data │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── blendable_dataset.py │ │ ├── data_utils.py │ │ ├── gpt2_dataset.py │ │ ├── helpers.cpp │ │ ├── helpers.cpython-38-x86_64-linux-gnu.so │ │ ├── indexed_dataset.py │ │ └── samplers.py │ ├── fused_kernels │ │ ├── __init__.py │ │ ├── compat.h │ │ ├── scaled_masked_softmax.cpp │ │ ├── scaled_masked_softmax.h │ │ ├── scaled_masked_softmax_cuda.cu │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ ├── scaled_upper_triang_masked_softmax.h │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ ├── setup.py │ │ ├── tests │ │ │ └── test_fused_kernels.py │ │ └── type_shim.h │ ├── gradient_noise_scale │ │ ├── __init__.py │ │ └── gradient_noise_scale.py │ ├── initialize.py │ ├── learning_rates.py │ ├── logging.py │ ├── model │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── flash_attention.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_softmax.py │ │ ├── gmlp.py │ │ ├── gpt2_model.py │ │ ├── init_functions.py │ │ ├── norms.py │ │ ├── positional_embeddings.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── word_embeddings.py │ ├── mpu │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── initialize.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── mup_substitute.py │ ├── neox_arguments │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── deepspeed_args.py │ │ ├── neox_args.py │ │ └── template.py │ ├── optimizers.py │ ├── text_generation_utils.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── gpt2_tokenization.py │ │ ├── tokenizer.py │ │ └── train_tokenizer.py │ ├── training.py │ └── utils.py ├── prepare_data.py ├── requirements │ ├── requirements-dev.txt │ ├── requirements-flashattention.txt │ ├── requirements-onebitadam.txt │ ├── requirements-s3.txt │ ├── requirements-sparseattention.txt │ ├── requirements-tensorboard.txt │ ├── requirements-wandb.txt │ └── requirements.txt ├── scripts_swsok │ ├── 0.remove_nvidia_driver_and_cuda.sh │ ├── 1.cuda_11_7_install.sh │ ├── 10.run_and_collect_logs_single.sh │ ├── 11.cat_csv_from_log.sh │ ├── 12.run_and_collect_logs_multi.sh │ ├── 13.run_20B_and_collect_logs_multi.sh │ ├── 14.long_seqlen_6.7B.sh │ ├── 15.long_seqlen_1.3B.sh │ ├── 16.zero_opt_stages_1.3B.sh │ ├── 17.760M_zero_stages.sh │ ├── 2.docker_and_nvidia_container_toolkit_install.sh │ ├── 3.required_packages_install.sh │ ├── 4.requirements_install.sh │ ├── 5.prepare_dataset.sh │ ├── 6.pretrain_125M_local.sh │ ├── 7.patch_best_download.sh │ ├── 8.print_loss_progress.sh │ ├── 9.run_docker.sh │ └── run_sshd.sh ├── seccomp-docker.json ├── tests │ ├── README.md │ ├── __init__.py │ ├── common.py │ ├── model │ │ ├── __init__.py │ │ ├── test_fused_kernels.py │ │ ├── test_model_checkpoint.py │ │ ├── test_model_generation.py │ │ ├── test_model_instantiation.py │ │ └── test_model_train.py │ ├── neox_args │ │ ├── __init__.py │ │ ├── test_neoxargs_commandline.py │ │ ├── test_neoxargs_implementation.py │ │ ├── test_neoxargs_load.py │ │ └── test_neoxargs_usage.py │ ├── pytest.ini │ └── test_configs │ │ └── test_train_base.yml ├── tools │ ├── README.md │ ├── __init__.py │ ├── bash │ │ ├── README.md │ │ ├── kill.sh │ │ ├── killall.sh │ │ ├── sync.sh │ │ ├── sync_cmd.sh │ │ └── syncdir.sh │ ├── ckpts │ │ ├── README.md │ │ ├── convert_hf_to_sequential.py │ │ ├── convert_module_to_hf.py │ │ ├── 
convert_raw_llama_weights_to_neox.py │ │ ├── convert_sequential_to_hf.py │ │ ├── inspect_checkpoints.py │ │ ├── merge20b.py │ │ └── upload.py │ ├── convert_hf_to_sequential.py │ ├── convert_module_to_hf.py │ ├── convert_raw_llama_weights_to_neox.py │ ├── convert_sequential_to_hf.py │ ├── corpora.py │ ├── datasets │ │ ├── README.md │ │ ├── corpora.py │ │ ├── merge_datasets.py │ │ ├── multinode_prepare_data.sh │ │ ├── preprocess_data.py │ │ └── preprocess_data_with_mask.py │ ├── inspect_checkpoints.py │ ├── kill.sh │ ├── killall.sh │ ├── merge20b.py │ ├── merge_datasets.py │ ├── merge_mp_partitions.py │ ├── multinode_prepare_data.sh │ ├── preprocess_data.py │ ├── preprocess_data_with_mask.py │ ├── sync.sh │ ├── sync_cmd.sh │ ├── syncdir.sh │ └── upload.py └── train.py ├── k8s_kubeflow_install ├── README.md ├── common │ ├── 00-prepare-nodes.sh │ ├── 01-install-cudnn-and-nvidia-driver.sh │ ├── 02-install-docker.sh │ ├── 03-install-nvidia-docker.sh │ ├── 04-install-k8s.sh │ ├── 05-init-k8s-master-only.sh │ ├── 06-install-kubeflow-master-only.sh │ ├── 07-certificate-kubeflow-master-only.sh │ ├── 08-port-forward-kubeflow-master-only.sh │ ├── 09-print-join-cmd.sh │ ├── 10-enable-k8s-dashboard-master-only.sh │ ├── 11-reset-k8s.sh │ ├── 12-add-kubeflow-user.sh │ ├── 13-port-forward-k8s-container.sh │ ├── 14-remove-a-node.sh │ ├── certificate.yaml │ ├── cluster-role-binding.yaml │ ├── dashboard-adminuser.yaml │ ├── gateway.yaml │ ├── profile.yaml │ ├── profile1.yaml │ ├── profile2.yaml │ ├── profile3.yaml │ ├── profile4.yaml │ ├── profile5.yaml │ ├── profile6.yaml │ ├── profile7.yaml │ ├── profile8.yaml │ └── profile9.yaml ├── docker │ ├── Dockerfile │ ├── Dockerfile.org │ ├── Dockerfile.scratch │ ├── cuda-requirements.txt │ ├── make_dockerimage.sh │ ├── requirements.txt │ └── s6 │ │ ├── cont-init.d │ │ └── 01-copy-tmp-home │ │ └── services.d │ │ └── jupyterlab │ │ └── run ├── setup_for_gpu_node_master.sh └── setup_for_gpu_node_worker.sh ├── llama3_inference ├── README.md ├── llama3_inference_basic.py └── llama3_inference_memory_offload.py ├── mlperf ├── README.md └── pytorch-22.09 │ ├── Dockerfile │ ├── LICENSE │ ├── NOTICE │ ├── README.md │ ├── README_2xa30_ngc22.09_pytorch.md │ ├── README_dgxa100_n512_ngc22.09_pytorch.md │ ├── README_dgxa100_n8_ngc22.09_pytorch.md │ ├── README_dgxa100_ngc22.09_pytorch.md │ ├── a30-run_and_time.sh │ ├── a30.sub │ ├── bmm1.py │ ├── bmm2.py │ ├── cleanup_scripts │ ├── chop_hdf5_files.py │ ├── clean.sh │ ├── cleanup_file.py │ ├── create_pretraining_data_wrapper.sh │ ├── create_wiki_test_set_md5_hashes.py │ ├── dataset_stats.py │ ├── do_gather.py │ ├── do_sentence_segmentation.py │ ├── extract_test_set_articles.py │ ├── parallel_create_hdf5.sh │ ├── process_wiki.sh │ ├── reshard_hdf5_files.py │ ├── transparency_in_test_set_generation.py │ └── wiki_test_set_md5.txt │ ├── config_A30_1x2x224x14.sh │ ├── config_A40_1x2x224x14.sh │ ├── config_DGXA100_1x4x56x2.sh │ ├── config_DGXA100_1x8x56x1.sh │ ├── config_DGXA100_4gpu_common.sh │ ├── config_DGXA100_512x8x2x1_pack.sh │ ├── config_DGXA100_8x8x48x1.sh │ ├── config_DGXA100_common.sh │ ├── convert_tf_checkpoint.py │ ├── extract_features.py │ ├── file_utils.py │ ├── fmha.py │ ├── function.py │ ├── fwd_loss_bwd_trainer.py │ ├── inference.py │ ├── input_preprocessing │ ├── 2048_shards_varlength.chk │ ├── 4320_shards_varlength.chk │ ├── chop_hdf5_files.py │ ├── chop_hdf5_files_to_varlength.py │ ├── clean.sh │ ├── cleanup_file.py │ ├── convert_fixed2variable.py │ ├── create_pretraining_data.py │ ├── 
create_pretraining_data_wrapper.sh │ ├── do_gather.py │ ├── do_sentence_segmentation.py │ ├── eval.md5 │ ├── eval_varlength.chk │ ├── hdf5_md5.py │ ├── packed_data │ │ ├── README.md │ │ ├── create_packed_trainset.py │ │ ├── create_per_seqlength_data.py │ │ ├── gather_per_seqlength_data.py │ │ ├── generate_packing_strategy.py │ │ └── prepare_packed_data.sh │ ├── parallel_create_hdf5.sh │ ├── pick_eval_samples.py │ ├── pick_eval_samples_varlength.py │ ├── prepare_data.sh │ ├── process_wiki.sh │ ├── seperate_test_set.py │ ├── shuffle_samples.py │ ├── shuffle_samples_write.py │ ├── shuffle_samples_write.py_bak │ └── tokenization.py │ ├── mha.py │ ├── mhalib │ ├── mha_funcs.cu │ └── setup.py │ ├── mlperf_logger.py │ ├── model │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── attention.py │ │ ├── embeddings.py │ │ ├── fused.py │ │ └── layernorm.py │ ├── losses │ │ └── __init__.py │ └── models │ │ └── __init__.py │ ├── modeling.py │ ├── mounts.txt │ ├── optim │ └── distributed_fused_lamb.py │ ├── optimization.py │ ├── padding.py │ ├── requirements.txt │ ├── run.sub │ ├── run_and_time.sh │ ├── run_pretraining.py │ ├── run_squad.py │ ├── run_test.sh │ ├── run_with_docker.sh │ ├── scaleoutbridge.py │ ├── schedulers.py │ ├── scripts │ └── run_pretraining.sh │ ├── softmax.py │ ├── tokenization.py │ ├── unit_test │ ├── __init__.py │ ├── global_vars.py │ ├── test_bert_batch_1.py │ ├── test_bert_batch_7.py │ ├── test_data_path.py │ ├── test_embeddings_batch_1.py │ ├── test_encoders_batch_1.py │ ├── test_main.py │ └── unit_test_utils.py │ └── utils.py ├── opt_prime ├── README.md ├── demo │ ├── README.md │ ├── pp_train_gpt2.py │ ├── pp_train_llama-13b.py │ └── pp_train_llama-8b.py ├── examples │ ├── pp_train_bert.py │ ├── pp_train_electra.py │ ├── pp_train_gpt-neo.py │ ├── pp_train_gpt2-large.py │ ├── pp_train_gpt2-medium.py │ ├── pp_train_gpt2-xl-flops.py │ ├── pp_train_gpt2-xl.py │ ├── pp_train_gpt2.py │ ├── pp_train_gpt2_autocast.py │ ├── pp_train_gpt2_seq-cls.py │ ├── pp_train_gptj.py │ ├── pp_train_gptj2.py │ ├── pp_train_llama-small.py │ ├── pp_train_llama.py │ ├── pp_train_llama2.py │ ├── pp_train_llama3.py │ ├── pp_train_llama4.py │ ├── pp_train_llama5.py │ ├── pp_train_llama6.py │ ├── pp_train_llama7.py │ ├── pp_train_llama_autocast.py │ ├── pp_train_opt.py │ ├── pp_train_opt2.py │ ├── pp_train_synthetic.py │ ├── pp_train_synthetic2.py │ ├── pp_train_vit.py │ └── pp_train_whisper.py └── opt_prime │ ├── IR.py │ ├── __init__.py │ ├── comm.py │ ├── opti_pri.py │ └── schedule.py └── torchgpipe_OOO_PP ├── LICENSE ├── README.md ├── gpipe_opt_synthetic3.py ├── gpipe_opt_synthetic3_gpu.py ├── gpipe_opt_synthetic4_gpu.py ├── gpipe_opt_synthetic5_gpu.py ├── gpipe_opt_transformer_gpu.py ├── gpipe_static_opt_synthetic2.py ├── gpipe_static_opt_synthetic2_gpu.py ├── gpipe_synthetic1.py └── gpipe_transformer_gpu.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022-present, ETRI, All rights reserved. 4 | 5 | From PyTorch: 6 | Copyright (c) 2014- Facebook, Inc, All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | 2. 
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# High-efficiency AI computing SW core technology project (AIcomp)

This project develops low-cost, high-efficiency AI computing platform technology to overcome two inefficiencies of large-model training: the excessive computing resources it consumes and its dependency on specific high-cost hyperclusters.

We are developing parallelization framework software called OptimusPrime, which currently provides two-dimensional parallelization (pipeline parallelism combined with data parallelism) and memory-efficient optimization features ( [opt_prime folder](./opt_prime) ).

Additionally, we have developed multiple PoCs along the way. In the early stages of this project, we presented training PoCs that integrated Out-Of-Order technology (https://dl.acm.org/doi/pdf/10.1145/3492321.3519563) on top of torchgpipe ( [torchgpipe_OOO_PP folder](./torchgpipe_OOO_PP) ). In the next stage, we developed multiple PoCs that extract an IR from the model and perform distributed training by partitioning it across multiple GPUs ( [compiler_fx folder](./compiler_fx) ).

In addition, we aim to apply compiler-based 3D parallelism to models. Related PoCs are being developed ahead of time, and the ETRI framework SW will be developed in earnest in the second half of the year.
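To give a flavor of the compiler-based approach, the sketch below extracts an FX IR from a toy model and partitions it into pipeline stages. It is a minimal illustration built on stock torch.fx utilities, not the code of the compiler_fx PoCs themselves:

```python
# Minimal sketch of the compiler_fx idea using stock torch.fx utilities
# (illustrative only; the actual PoCs live in the compiler_fx folder).
import torch
import torch.nn as nn
from torch.fx import symbolic_trace
from torch.fx.passes.split_module import split_module

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))
gm = symbolic_trace(model)                 # extract the FX IR (a GraphModule)

mods = [n.name for n in gm.graph.nodes if n.op == "call_module"]

def stage_of(node):                        # naive 2-way split by position
    if node.op != "call_module":
        return 0
    return 0 if mods.index(node.name) < len(mods) // 2 else 1

stages = split_module(gm, model, stage_of) # yields submod_0, submod_1, ...
x = torch.randn(4, 16)
assert torch.allclose(stages(x), model(x)) # the split preserves semantics
```

In the distributed PoCs, each resulting submodule can then be placed on its own GPU/rank and driven by a pipeline schedule.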
## Features

An open-source AI training framework that provides automatic parallelization without model modifications ( [opt_prime](./opt_prime) )

* Enables parallelization of general models by removing constraints on model representation (compatible with Hugging Face models and PyTorch nn.Module)
* Automatic parallelization (model splitting) without user intervention
* Distributed parallel runtime supporting intra-host and inter-host execution concurrently (currently supports PP + DP)
* An IR-based system aiming for flexible optimization at a global level
* Memory optimization technology for avoiding CPU/GPU out-of-memory (OOM) failures


## License

The results of the AIcomp project are distributed under the 3-clause BSD license.
--------------------------------------------------------------------------------
/compiler_fx/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2022-present, ETRI, All rights reserved.

From PyTorch:
Copyright (c) 2014- Facebook, Inc, All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/compiler_fx/README.md:
--------------------------------------------------------------------------------
## 3D parallelism by compiler-based FX IR

Current: FX IR-based Pipeline Parallelism PoC, FX IR manipulation PoC, and so on

Future work: integration into PyTorch 2.0's compiler mechanism

## License

The results of the AIcomp project are distributed under the 3-clause BSD license.
--------------------------------------------------------------------------------
/compiler_fx/memory_usage2.py:
--------------------------------------------------------------------------------
import torch
from torch.nn.parameter import Parameter
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch.nn import init
import math

import psutil
import os
pid = os.getpid()
print(f">> Process ID: {pid}")

#use_reset_parameters = False
use_reset_parameters = True

print_flag = True

def print_memory_usage(label, print_flag):
    if print_flag:
        print(" =========", label, "=========")
        my_process = psutil.Process(pid)
        usage = my_process.memory_info().rss / (1024 ** 3)  # GB unit
        print(f" Memory Usage: {usage:.3f} GB")


# Re-implementation of nn.Linear that can optionally skip reset_parameters(),
# to compare memory usage with and without parameter initialization.
class Linear2(nn.Module):
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_features: int):
        super(Linear2, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight = Parameter(torch.empty((out_features, in_features)))
        self.bias = Parameter(torch.empty(out_features))

        if use_reset_parameters:
            self.reset_parameters()

    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

    def reset_parameters(self) -> None:
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(self.bias, -bound, bound)

print_memory_usage("Before: m = Linear2(10000, 20000)", print_flag)

m = Linear2(10000, 20000)

print(f" ***** use reset_parameters() : {use_reset_parameters} *****")
print_memory_usage("After: m = Linear2(10000, 20000)", print_flag)
print(f"{m.weight}")
input = torch.randn(20000, 10000)
print_memory_usage("After: input = torch.randn(20000, 10000)", print_flag)
output = m(input)
print_memory_usage("After: output = m(input)", print_flag)
print(f"{output.size()}")
--------------------------------------------------------------------------------
/compiler_fx/util_find_linear.py:
--------------------------------------------------------------------------------
import torch
from transformers import AutoModel
import sys
import os

def find_linear_modules(model):
    linear_modules = []

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            linear_modules.append((name, module))

    return linear_modules

def find_linear_modules2(model):
    linear_modules = []

    def recursive_search(prefix, module):
        for name, sub_module in module.named_children():
            full_name = f"{prefix}.{name}" if prefix else name
            if isinstance(sub_module, torch.nn.Linear):
            #if isinstance(sub_module, torch.nn.Linear) or isinstance(sub_module, torch.nn.Conv1d) or hasattr(sub_module, "weight"):
                linear_modules.append((full_name, sub_module))
            else:
                recursive_search(full_name, sub_module)

    recursive_search("", model)

    return linear_modules

if __name__ == "__main__":
    #print(f"len(sys.argv) --> {len(sys.argv)}")
    #print(f"sys.argv[0] --> {sys.argv[0]}")


    #model_name = "bert-base-uncased"
    #model_name = "openai/whisper-base"
    model_name = "facebook/opt-350m"

    model = AutoModel.from_pretrained(model_name)

    #print(f"model: {model}")

    linear_layers = find_linear_modules(model)
    #linear_layers = find_linear_modules2(model)

    if linear_layers:
        for name, module in linear_layers:
            print(f"- {name}: {module}")
        print(f">> found: {len(linear_layers)} nn.Linear")
    else:
        print("No nn.Linear found")
--------------------------------------------------------------------------------
/compiler_fx/util_setup_mesh.py:
--------------------------------------------------------------------------------
import os
import sys
import time
import math

import torch
import torch.distributed as dist

from torch.distributed.device_mesh import init_device_mesh


rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
master_addr = os.getenv("MASTER_ADDR")
master_port = os.getenv("MASTER_PORT")

#
# world size 8
#
pp_size = 2
tp_size = 2
dp_size = 2

#
# world size 16
#
#pp_size = 8
#tp_size = 2
#dp_size = 1

#
# world size 12
#
#pp_size = 3
#dp_size = 2
#tp_size = 2


assert world_size == pp_size * dp_size * tp_size, f"pp_size({pp_size}) * dp_size({dp_size}) * tp_size({tp_size}) must be equal to world_size({world_size})"
assert world_size % tp_size == 0, f"world size({world_size}) must be divisible by tp size({tp_size})"
assert world_size % dp_size == 0, f"world size({world_size}) must be divisible by dp size({dp_size})"


dist.init_process_group("nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(local_rank)

device = torch.device(f"cuda:{local_rank}")

device_mesh = init_device_mesh("cuda", mesh_shape=(pp_size, dp_size, tp_size), mesh_dim_names=("pp", "dp", "tp"))
tp_group = device_mesh["tp"].get_group()
dp_group = device_mesh["dp"].get_group()
pp_group = device_mesh["pp"].get_group()
tp_mesh = device_mesh["tp"]
dp_mesh = device_mesh["dp"]
pp_mesh = device_mesh["pp"]


print(f"[{rank}] >>> pp group:{pp_mesh}, dp_group:{dp_mesh}, tp_group:{tp_mesh}")

time.sleep(2)

print(f"[rank:{rank}] run completed ...")
--------------------------------------------------------------------------------
/gpt-neox/.dockerignore:
--------------------------------------------------------------------------------
20B_checkpoints/
--------------------------------------------------------------------------------
/gpt-neox/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: check-case-conflict
      - id: check-json
      - id: check-symlinks
      - id: check-yaml
      - id: destroyed-symlinks
      - id: end-of-file-fixer
        exclude: docs/CNAME
      - id: fix-byte-order-marker
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
      - id: requirements-txt-fixer
      - id: trailing-whitespace
  - repo: https://gitlab.com/daverona/pre-commit-cpp
    rev: 0.8.0
    hooks:
      - id: clang-format # formatter of C/C++
code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 23 | args: [] 24 | 25 | - repo: https://github.com/psf/black 26 | rev: 22.3.0 27 | hooks: 28 | - id: black 29 | language_version: python3 30 | - repo: https://github.com/codespell-project/codespell 31 | rev: v2.1.0 32 | hooks: 33 | - id: codespell 34 | args: [ 35 | '--ignore-words-list=reord,dout', # Word used in error messages that need rewording 36 | --check-filenames, 37 | --check-hidden, 38 | ] 39 | -------------------------------------------------------------------------------- /gpt-neox/CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | authors: 4 | - affiliation: EleutherAI 5 | family-names: Andonian 6 | given-names: Alex 7 | - affiliation: EleutherAI 8 | family-names: Anthony 9 | given-names: Quentin 10 | - affiliation: EleutherAI 11 | family-names: Biderman 12 | given-names: Stella 13 | - affiliation: EleutherAI 14 | family-names: Black 15 | given-names: Sid 16 | - affiliation: EleutherAI 17 | family-names: Gali 18 | given-names: Preetham 19 | - affiliation: EleutherAI 20 | family-names: Gao 21 | given-names: Leo 22 | - affiliation: EleutherAI 23 | family-names: Hallahan 24 | given-names: Eric 25 | - affiliation: EleutherAI 26 | family-names: Levy-Kramer 27 | given-names: Josh 28 | - affiliation: EleutherAI 29 | family-names: Leahy 30 | given-names: Connor 31 | - affiliation: EleutherAI 32 | family-names: Nestler 33 | given-names: Lucas 34 | - affiliation: EleutherAI 35 | family-names: Parker 36 | given-names: Kip 37 | - affiliation: EleutherAI 38 | family-names: Pieler 39 | given-names: Michael 40 | - affiliation: EleutherAI 41 | family-names: Phang 42 | given-names: Jason 43 | - affiliation: EleutherAI 44 | family-names: Purohit 45 | given-names: Shivanshu 46 | - affiliation: EleutherAI 47 | family-names: Schoelkopf 48 | given-names: Hailey 49 | - affiliation: EleutherAI 50 | family-names: Stander 51 | given-names: Dashiell 52 | - affiliation: EleutherAI 53 | family-names: Songz 54 | given-names: Tri 55 | - affiliation: EleutherAI 56 | family-names: Tigges 57 | given-names: Curt 58 | - affiliation: EleutherAI 59 | family-names: Thérien 60 | given-names: Benjamin 61 | - affiliation: EleutherAI 62 | family-names: Wang 63 | given-names: Phil 64 | - affiliation: EleutherAI 65 | family-names: Weinbach 66 | given-names: Samuel 67 | cff-version: "1.1.0" 68 | keywords: 69 | - "Transformers" 70 | - "Massive language model" 71 | - "Autoregressive language model" 72 | license: "Apache-2.0" 73 | message: "If you use this software, please cite it using these metadata." 74 | repository-code: "https://www.github.com/eleutherai/gpt-neox" 75 | title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch" 76 | version: "2.0.0" 77 | doi: "10.5281/zenodo.5879544" 78 | date-released: 2021-08-23 79 | ... 
--------------------------------------------------------------------------------
/gpt-neox/MANIFEST.in:
--------------------------------------------------------------------------------
include megatron/data/Makefile
include megatron/data/helpers.cpp
--------------------------------------------------------------------------------
/gpt-neox/README-MUP.md:
--------------------------------------------------------------------------------
# How to use Mup (https://github.com/microsoft/mup)

## Add mup neox args to your config

```
# mup

"use-mup": true,

"save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank

"base-shapes-file": "base-shapes", # load base shapes from this file

"coord-check": false, # generate coord check plots to verify mup's implementation in neox

# mup hp search

"mup-init-scale": 1.0,

"mup-attn-temp": 1.0,

"mup-output-temp": 1.0,

"mup-embedding-mult": 1.0,

"mup-rp-embedding-mult": 1.0,
```

## Generate base shapes

1. Set use-mup to true
2. Set save-base-shapes to true
3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank, named `<base-shapes-file>.<rank>`. gpt-neox will exit immediately.
4. Set save-base-shapes to false

## Generate coord check plots (optional)

1. Keep use-mup true
2. Set coord-check to true
3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately.
4. Set coord-check to false

## Tune mup hyperparameters and LR

The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These and the LR are tuned with a random search using the scaled-up config (tested with 6-7B.yml), but with hidden-size set to the value from the scaled-down config (125M.yml).

## Transfer

With the best LR and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again.
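For reference, the steps above drive roughly the following mup API under the hood. This is a minimal standalone sketch against the microsoft/mup package (not gpt-neox code); `MLP` and the width values are illustrative placeholders, and a faithful model would also use `mup.MuReadout` for its output layer:

```python
# Standalone sketch of the mup workflow the steps above automate.
# MLP and the width values are placeholders, not gpt-neox code.
import torch.nn as nn
from mup import MuAdam, make_base_shapes, set_base_shapes

class MLP(nn.Module):
    def __init__(self, width):
        super().__init__()
        self.body = nn.Linear(width, width)
        self.head = nn.Linear(width, 10)  # a faithful setup would use mup.MuReadout here
    def forward(self, x):
        return self.head(self.body(x))

# "Generate base shapes": compare a base model against a delta model that
# differs only in the widths to be scaled, and save one shapes file.
make_base_shapes(MLP(width=256), MLP(width=512), savefile="base-shapes")

# Training time: register the base shapes on the scaled-up model and use a
# mup-aware optimizer so the tuned LR transfers across widths.
model = MLP(width=4096)
set_base_shapes(model, "base-shapes")
optimizer = MuAdam(model.parameters(), lr=1e-4)
```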
50 | -------------------------------------------------------------------------------- /gpt-neox/configs/1.3B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 0, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 64, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 1, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 20, 77 | "lr_decay_iters": 20, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/125M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | 
"device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/2.7B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 
| "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/250M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 8, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 200, 77 | "lr_decay_iters": 200, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | "eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 10, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/6.7B-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 4, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | "num_layers": 32, 7 | "hidden_size": 4096, 8 | "num_attention_heads": 32, 9 | "seq_length": 32768, 10 | "max_position_embeddings": 32768, 11 | "pos_emb": "rotary", 12 | "rotary_pct": 1, 13 | "rotary_emb_base": 10000, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | 18 | "attention_config": [[["flash"], 32]], 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | 25 | "init_method": "small_init", 26 | "output_layer_init_method": "wang_init", 27 | 28 | "optimizer": { 29 | "type": "adam", 30 | "params": { 31 | "lr": 0.0001, 32 | "betas": [0.9, 0.95], 33 | "eps": 1.0e-6, 34 | } 35 | }, 36 
| "min_lr": 0.00001, 37 | 38 | "zero_optimization": { 39 | "stage": 1, 40 | "offload_param": { 41 | "device": "cpu" 42 | }, 43 | "allgather_partitions": True, 44 | "allgather_bucket_size": 500000000, 45 | "overlap_comm": True, 46 | "reduce_scatter": True, 47 | "reduce_bucket_size": 500000000, 48 | "contiguous_gradients": True, 49 | }, 50 | 51 | # "train_batch_size": 128, 52 | # "train_batch_size": 32, 53 | "gradient_accumulation_steps": 32, 54 | "split": "960,35,5", 55 | "train_micro_batch_size_per_gpu": 4, 56 | "data_impl": "mmap", 57 | 58 | "checkpoint_activations": true, 59 | "checkpoint_num_layers": 1, 60 | "partition_activations": true, 61 | "synchronize_each_layer": true, 62 | 63 | "gradient_clipping": 1.0, 64 | "weight_decay": 0.1, 65 | "hidden_dropout": 0, 66 | "attention_dropout": 0, 67 | 68 | "fp16": { 69 | "fp16": true, 70 | "enabled": true, 71 | "loss_scale": 0, 72 | "loss_scale_window": 1000, 73 | "hysteresis": 2, 74 | "min_loss_scale": 1 75 | }, 76 | 77 | "train_iters": 200, 78 | "lr_decay_iters": 200, 79 | "distributed_backend": "nccl", 80 | "lr_decay_style": "cosine", 81 | "warmup": 0.05, 82 | "checkpoint_factor": 450, 83 | "eval_interval": 1800, 84 | "eval_iters": 10, 85 | 86 | "log_interval": 10, 87 | "steps_per_print": 1, 88 | "keep_last_n_checkpoints": 10, 89 | "wall_clock_breakdown": true, 90 | } 91 | -------------------------------------------------------------------------------- /gpt-neox/configs/760M-32k-len-conf.yml: -------------------------------------------------------------------------------- 1 | { 2 | "model_parallel_size": 1, 3 | "make_vocab_size_divisible_by": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1536, 7 | "num_attention_heads": 16, 8 | "seq_length": 32768, 9 | "max_position_embeddings": 32768, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 1, 12 | "rotary_emb_base": 10000, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": false, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], all]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": false, 21 | "use_bias_in_norms": false, 22 | "use_bias_in_attn_linear": false, 23 | 24 | "init_method": "small_init", 25 | "output_layer_init_method": "wang_init", 26 | 27 | "optimizer": { 28 | "type": "adam", 29 | "params": { 30 | "lr": 0.0001, 31 | "betas": [0.9, 0.95], 32 | "eps": 1.0e-6, 33 | } 34 | }, 35 | "min_lr": 0.00001, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "offload_param": { 40 | "device": "cpu" 41 | }, 42 | "allgather_partitions": True, 43 | "allgather_bucket_size": 500000000, 44 | "overlap_comm": True, 45 | "reduce_scatter": True, 46 | "reduce_bucket_size": 500000000, 47 | "contiguous_gradients": True, 48 | }, 49 | 50 | # "train_batch_size": 128, 51 | # "train_batch_size": 32, 52 | "gradient_accumulation_steps": 64, 53 | "split": "960,35,5", 54 | "train_micro_batch_size_per_gpu": 2, 55 | "data_impl": "mmap", 56 | 57 | "checkpoint_activations": true, 58 | "checkpoint_num_layers": 1, 59 | "partition_activations": true, 60 | "synchronize_each_layer": true, 61 | 62 | "gradient_clipping": 1.0, 63 | "weight_decay": 0.1, 64 | "hidden_dropout": 0, 65 | "attention_dropout": 0, 66 | 67 | "fp16": { 68 | "fp16": true, 69 | "enabled": true, 70 | "loss_scale": 0, 71 | "loss_scale_window": 1000, 72 | "hysteresis": 2, 73 | "min_loss_scale": 1 74 | }, 75 | 76 | "train_iters": 10, 77 | "lr_decay_iters": 10, 78 | "distributed_backend": "nccl", 79 | "lr_decay_style": "cosine", 80 | "warmup": 0.05, 81 | "checkpoint_factor": 450, 82 | 
"eval_interval": 1800, 83 | "eval_iters": 10, 84 | 85 | "log_interval": 5, 86 | "steps_per_print": 1, 87 | "keep_last_n_checkpoints": 10, 88 | "wall_clock_breakdown": true, 89 | } 90 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/small_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 12, 6 | "hidden-size": 768, 7 | "num-attention-heads": 12, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.0006, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "data-impl": "mmap", 29 | "split": "949,50,1", 30 | 31 | "checkpoint-activations": true, 32 | "checkpoint-num-layers": 1, 33 | "partition-activations": true, 34 | "synchronize-each-layer": true, 35 | 36 | "gradient_clipping": 1.0, 37 | "weight-decay": 0.0, 38 | "hidden-dropout": 0.0, 39 | "attention-dropout": 0.0, 40 | 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "loss_scale_window": 1000, 45 | "hysteresis": 2, 46 | "min_loss_scale": 1 47 | }, 48 | 49 | "train-iters": 320000, 50 | "lr-decay-iters": 320000, 51 | "distributed-backend": "nccl", 52 | "lr-decay-style": "cosine", 53 | "warmup": 0.01, 54 | "save-interval": 10000, 55 | "eval-interval": 1000, 56 | "eval-iters": 10, 57 | 58 | "log-interval": 100, 59 | "steps_per_print": 10, 60 | "keep-last-n-checkpoints": 4, 61 | "wall_clock_breakdown": true, 62 | "launcher": "slurm", 63 | "deepspeed_slurm": true, 64 | "comment": "neox", 65 | "autotuning": { 66 | "enabled": true, 67 | "arg_mappings": { 68 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 69 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 70 | } 71 | }, 72 | "zero_optimization": { 73 | "stage": [0, 1, 2, 3] 74 | }, 75 | "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 76 | "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], 77 | "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | "num-layers": 12, 5 | "hidden-size": 768, 6 | "num-attention-heads": 12, 7 | "seq-length": 2048, 8 | "max-position-embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos-emb": "rotary", 11 | "no-weight-tying": true, 12 | "scaled-upper-triang-masked-softmax-fusion": true, 13 | "bias-gelu-fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true, 30 | "cpu_offload": false 31 | }, 32 | "train_micro_batch_size_per_gpu": 1, 33 | "autotuning_config": { 34 | "enabled": true, 35 | "arg_mappings": { 36 
| "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 37 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 38 | } 39 | }, 40 | "data-impl": "mmap", 41 | "split": "949,50,1", 42 | "checkpoint-activations": true, 43 | "checkpoint-num-layers": 1, 44 | "partition-activations": true, 45 | "synchronize-each-layer": true, 46 | "gradient_clipping": 1.0, 47 | "weight-decay": 0.0, 48 | "hidden-dropout": 0.0, 49 | "attention-dropout": 0.0, 50 | "fp16": { 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | "train-iters": 200, 58 | "lr-decay-iters": 320000, 59 | "distributed-backend": "nccl", 60 | "lr-decay-style": "cosine", 61 | "warmup": 0.01, 62 | "save-interval": 10000, 63 | "eval-interval": 1000, 64 | "eval-iters": 10, 65 | "log-interval": 100, 66 | "steps_per_print": 10, 67 | "keep-last-n-checkpoints": 4, 68 | "wall_clock_breakdown": true, 69 | "launcher": "slurm", 70 | "deepspeed_slurm": true, 71 | "comment": "neox" 72 | } 73 | -------------------------------------------------------------------------------- /gpt-neox/configs/autotuning_configs/tune_1-3B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 1, 4 | 5 | "num-layers": 24, 6 | "hidden-size": 2048, 7 | "num-attention-heads": 16, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | "attention_config": [[["flash"], 24]], 16 | "scaled-upper-triang-masked-softmax-fusion": false, 17 | "bias-gelu-fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0002, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00002, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | "train_micro_batch_size_per_gpu": 1, 42 | "autotuning": { 43 | "enabled": true, 44 | "arg_mappings": { 45 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 46 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 47 | } 48 | }, 49 | "data-impl": "mmap", 50 | 51 | "checkpoint-activations": false, 52 | "checkpoint-num-layers": 1, 53 | "partition-activations": true, 54 | "synchronize-each-layer": true, 55 | 56 | "gradient_clipping": 1.0, 57 | "weight-decay": 0.1, 58 | "hidden-dropout": 0, 59 | "attention-dropout": 0, 60 | 61 | "fp16": { 62 | "fp16": true, 63 | "enabled": true, 64 | "loss_scale": 0, 65 | "loss_scale_window": 1000, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train-iters": 320000, 71 | "lr-decay-iters": 320000, 72 | "distributed-backend": "nccl", 73 | "lr-decay-style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint-factor": 10000, 76 | "eval-interval": 1000, 77 | "eval-iters": 10, 78 | "launcher": "slurm", 79 | "deepspeed_slurm": true, 80 | "no_ssh_check": true, 81 | 82 | "log-interval": 10, 83 | "steps_per_print": 10, 84 | "keep-last-n-checkpoints": 1, 85 | "wall_clock_breakdown": true 86 | } 87 | -------------------------------------------------------------------------------- 
/gpt-neox/configs/autotuning_configs/tune_6-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipe-parallel-size": 1, 3 | "model-parallel-size": 8, 4 | 5 | "num-layers": 32, 6 | "hidden-size": 4096, 7 | "num-attention-heads": 32, 8 | "seq-length": 2048, 9 | "max-position-embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos-emb": "rotary", 12 | "no-weight-tying": true, 13 | 14 | "scaled-upper-triang-masked-softmax-fusion": false, 15 | "bias-gelu-fusion": false, 16 | 17 | 18 | "optimizer": { 19 | "type": "Adam", 20 | "params": { 21 | "lr": 0.00012, 22 | "betas": [0.9, 0.999], 23 | "eps": 1.0e-8 24 | } 25 | }, 26 | 27 | "train_micro_batch_size_per_gpu": 1, 28 | "zero_optimization": { 29 | "stage": [0, 1, 2, 3] 30 | }, 31 | "data-impl": "mmap", 32 | "split": "949,50,1", 33 | 34 | "checkpoint-activations": true, 35 | "checkpoint-num-layers": 1, 36 | "partition-activations": true, 37 | "synchronize-each-layer": true, 38 | 39 | "gradient_clipping": 1.0, 40 | "weight-decay": 0, 41 | "hidden-dropout": 0, 42 | "attention-dropout": 0, 43 | 44 | "fp16": { 45 | "fp16": true, 46 | "enabled": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "hysteresis": 2, 50 | "min_loss_scale": 1 51 | }, 52 | 53 | "train-iters": 100, 54 | "lr-decay-iters": 320000, 55 | "distributed-backend": "nccl", 56 | "lr-decay-style": "cosine", 57 | "warmup": 0.01, 58 | "checkpoint-factor": 10000, 59 | "eval-interval": 1000, 60 | "eval-iters": 10, 61 | "log-interval": 100, 62 | "steps_per_print": 10, 63 | "keep-last-n-checkpoints": 4, 64 | "wall_clock_breakdown": true, 65 | "launcher": "slurm", 66 | "deepspeed_slurm": true, 67 | "no_ssh_check": true, 68 | "comment": "neox", 69 | "autotuning": { 70 | "enabled": true, 71 | "mp_size": 8, 72 | "arg_mappings": { 73 | "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", 74 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /gpt-neox/configs/enwik8.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | #"data_path": "/var/nfs/data/enwik8/enwik8_text_document", 4 | #"data_path": "/data/enwik8/enwik8/enwik8_text_document", 5 | "data_path": "./dataset/enwik8/enwik8/enwik8_text_document", 6 | 7 | # or for weighted datasets: 8 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 10 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 11 | # "train-data-weights": [1., 2.], 12 | # "test-data-weights": [2., 1.], 13 | # "valid-data-weights": [0.5, 0.4], 14 | 15 | # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
  # WARNING: setting this to True will override any user provided weights
  # "weight_by_num_documents": false,
  # "weighted_sampler_alpha": 0.3,

  "vocab_file": "./dataset/enwik8/gpt2-vocab.json",
  "merge_file": "./dataset/enwik8/gpt2-merges.txt",

  # "save": "checkpoints",
  # "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": False,

  "tensorboard_dir": "tensorboard",
  "log_dir": "./logs",
  "use_wandb": True,
  "wandb_host": "https://api.wandb.ai",
  "wandb_project": "neox"

}
--------------------------------------------------------------------------------
/gpt-neox/configs/etri_cluster.yml:
--------------------------------------------------------------------------------
# Configurations for using the ETRI GPU cluster
{
  "launcher": "pdsh",
  #"launcher": "openmpi",
  #"deepspeed_mpi": true,
  #"gradient_accumulation_steps": 8,
  #"global_num_gpus": 16,
  "hostfile": "./hostfile",
}
--------------------------------------------------------------------------------
/gpt-neox/configs/finetuning_configs/6-9B.yml:
--------------------------------------------------------------------------------
{
  # finetuning option
  "load": "/path/to/checkpoint",
  "finetune": true,

  "pipe-parallel-size": 1,
  "model-parallel-size": 2,

  "num-layers": 32,
  "hidden-size": 4096,
  "num-attention-heads": 32,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary_pct": 0.25,
  "no-weight-tying": true,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",

  "attention-config": [[["flash"], 32]],

  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,


  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00012,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },

  "min_lr": 0.000012,

  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 1260000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1260000000,
    "contiguous_gradients": true,
    "cpu_offload": false,
    "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params
  },

  "train_micro_batch_size_per_gpu": 8,
  "gradient_accumulation_steps": 2,
  "data-impl": "mmap",

  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  "train-iters": 143000,
  "lr-decay-iters": 143000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 1000,
  "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
  "eval-interval": 143000,
  "eval-iters": 10,

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,

  "tokenizer_type": "HFTokenizer"
}
--------------------------------------------------------------------------------
/gpt-neox/configs/llama/13B.yml:
-------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 40, 8 | "hidden_size": 5120, 9 | "num_attention_heads": 40, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/30B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 60, 8 | "hidden_size": 6656, 9 | "num_attention_heads": 52, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/65B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 8, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 80, 8 | "hidden_size": 8192, 9 | "num_attention_heads": 64, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/7B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "make_vocab_size_divisible_by": 1, 5 | 6 | # model settings 7 | "num_layers": 32, 8 | "hidden_size": 4096, 9 | "num_attention_heads": 32, 10 | "seq_length": 2048, 11 | "max_position_embeddings": 2048, 12 | "pos_emb": "rotary", 13 | "rotary_pct": 1, 14 | "no_weight_tying": true, 15 | "gpt_j_residual": false, 16 | "output_layer_parallelism": "column", 17 | "norm": "rmsnorm", 18 | "rms_norm_epsilon": 1.0e-6, 19 | 20 | "scaled_upper_triang_masked_softmax_fusion": true, 21 | "bias_gelu_fusion": false, 22 | "use_bias_in_norms": false, 23 | "use_bias_in_attn_linear": false, 24 | "mlp_type": "llama", 25 | "activation": "silu", 26 | } 27 | 
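The four LLaMA fragments above fix only the architecture and parallel topology; optimizer, batch, and precision settings arrive from `train_config.yml` when the fragments are merged at launch (see the README that follows). A minimal sketch of that merge semantics, assuming the fragments are plain YAML readable by PyYAML and that conflicting duplicate keys should fail loudly; `merge_config_fragments` is an illustrative name, not a function in this repo:

```python
# Minimal sketch of GPT-NeoX style config merging: each .yml fragment is a
# flat YAML mapping, and the fragments passed on the command line are folded
# into one dict. Assumes PyYAML; the conflict handling is an assumption --
# the real NeoXArgs loader does its own duplicate-key validation.
import yaml

def merge_config_fragments(paths):
    merged = {}
    for path in paths:
        with open(path) as f:
            fragment = yaml.safe_load(f) or {}
        for key, value in fragment.items():
            if key in merged and merged[key] != value:
                # Two fragments disagree on the same setting: fail loudly.
                raise ValueError(f"{path}: conflicting value for {key!r}")
            merged[key] = value
    return merged

# e.g. merge_config_fragments(["configs/llama/7B.yml",
#                              "configs/llama/train_config.yml"])
```

Keeping model geometry and training hyperparameters in separate fragments is what lets the same `train_config.yml` drive all four model sizes.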
-------------------------------------------------------------------------------- /gpt-neox/configs/llama/README.md: -------------------------------------------------------------------------------- 1 | # LLaMA 2 | 3 | ## Training and Finetuning 4 | 5 | These configs contain the architecture settings required to run inference/training/finetuning on the [LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite. 6 | 7 | LLaMA finetuning can be launched with 8 | ```sh 9 | python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml 10 | ``` 11 | 12 | If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`. 13 | 14 | 15 | ## Inference 16 | 17 | 18 | LLaMA generation can be launched with 19 | ```sh 20 | python ./deepy.py ./generate.py -d configs \ 21 | llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \ 22 | -i input_prompt.txt -o prompt_out.txt 23 | ``` 24 | -------------------------------------------------------------------------------- /gpt-neox/configs/llama/train_config.yml: -------------------------------------------------------------------------------- 1 | { 2 | # finetuning option 3 | "finetune": true, 4 | 5 | # init methods 6 | "init_method": "small_init", 7 | "output_layer_init_method": "wang_init", 8 | 9 | # optimizer settings 10 | "optimizer": { 11 | "type": "Adam", 12 | "params": { 13 | "lr": 0.0002, 14 | "betas": [0.9, 0.95], 15 | "eps": 1.0e-8, 16 | } 17 | }, 18 | "min_lr": 0.00002, 19 | "override_lr_scheduler": true, 20 | 21 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 22 | "zero_optimization": { 23 | "stage": 1, 24 | "allgather_partitions": True, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": True, 27 | "reduce_scatter": True, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": True, 30 | }, 31 | 32 | # batch / data settings 33 | "train_micro_batch_size_per_gpu": 4, 34 | "data_impl": "mmap", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": true, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0, 46 | "attention_dropout": 0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "fp16": true, 51 | "enabled": true, 52 | "loss_scale": 0, 53 | "loss_scale_window": 1000, 54 | "hysteresis": 2, 55 | "min_loss_scale": 1 56 | }, 57 | 58 | # misc. 
training settings 59 | "train_iters": 320000, 60 | "lr_decay_iters": 320000, 61 | "distributed_backend": "nccl", 62 | "lr_decay_style": "cosine", 63 | "warmup": 0.01, 64 | "checkpoint_factor": 10000, 65 | "eval_interval": 1000, 66 | "eval_iters": 10, 67 | 68 | # logging 69 | "log_interval": 100, 70 | "steps_per_print": 10, 71 | "keep_last_n_checkpoints": 4, 72 | "wall_clock_breakdown": true, 73 | } 74 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/125M-json.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.00006, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true 40 | }, 41 | 42 | "train_micro_batch_size_per_gpu": 4, 43 | "data_impl": "mmap", 44 | 45 | "checkpoint_activations": true, 46 | "checkpoint_num_layers": 1, 47 | "partition_activations": true, 48 | "synchronize_each_layer": true, 49 | 50 | "gradient_clipping": 1.0, 51 | "weight_decay": 0.1, 52 | "hidden_dropout": 0.0, 53 | "attention_dropout": 0.0, 54 | 55 | "fp16": { 56 | "enabled": true, 57 | "loss_scale": 0, 58 | "loss_scale_window": 1000, 59 | "hysteresis": 2, 60 | "min_loss_scale": 1 61 | }, 62 | 63 | "train_iters": 320000, 64 | "lr_decay_iters": 320000, 65 | "distributed_backend": "nccl", 66 | "lr_decay_style": "cosine", 67 | "warmup": 0.01, 68 | "checkpoint_factor": 10000, 69 | "eval_interval": 1000, 70 | "eval_iters": 10, 71 | 72 | "log_interval": 100, 73 | "steps_per_print": 10, 74 | "keep_last_n_checkpoints": 4, 75 | "wall_clock_breakdown": true, 76 | 77 | "hostfile": "/mock_path" 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/800M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | # model settings 6 | "num_layers": 16, 7 | "hidden_size": 2048, 8 | "num_attention_heads": 8, 9 | "seq_length": 2048, 10 | "max_position_embeddings": 2048, 11 | "pos_emb": "rotary", 12 | "no_weight_tying": true, 13 | "gpt_j_residual": false, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": false, 17 | "bias_gelu_fusion": false, 18 | 19 | # init methods 20 | "init_method": "small_init", 21 | "output_layer_init_method": "wang_init", 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00025, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8, 29 | } 30 | }, 31 | "min_lr": 0.000025, 32 | 33 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 34 | 
"zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": True, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": True, 39 | "reduce_scatter": True, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": True, 42 | }, 43 | 44 | "train_micro_batch_size_per_gpu": 16, 45 | "gas": 1, 46 | "data_impl": "mmap", 47 | "num_workers": 1, 48 | 49 | # activation checkpointing 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | # regularization 56 | "gradient_clipping": 1.0, 57 | "weight_decay": 0.1, 58 | "hidden_dropout": 0, 59 | "attention_dropout": 0, 60 | 61 | # precision settings 62 | "fp16": { 63 | "fp16": true, 64 | "enabled": true, 65 | "loss_scale": 0, 66 | "loss_scale_window": 1000, 67 | "initial_scale_power": 12, 68 | "hysteresis": 2, 69 | "min_loss_scale": 1, 70 | }, 71 | 72 | "train_iters": 143000, 73 | "lr_decay_iters": 143000, 74 | "distributed_backend": "nccl", 75 | "lr_decay_style": "cosine", 76 | "warmup": 0.01, 77 | "checkpoint_factor": 1000, 78 | "eval_interval": 40000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/bf16_125M.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | 8 | # model settings 9 | "num_layers": 12, 10 | "hidden_size": 768, 11 | "num_attention_heads": 12, 12 | "seq_length": 2048, 13 | "max_position_embeddings": 2048, 14 | "norm": "layernorm", 15 | "pos_emb": "rotary", 16 | "no_weight_tying": true, 17 | 18 | # these should provide some speedup but takes a while to build, set to true if desired 19 | "scaled_upper_triang_masked_softmax_fusion": false, 20 | "bias_gelu_fusion": false, 21 | 22 | 23 | # optimizer settings 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.999], 29 | "eps": 1.0e-8, 30 | } 31 | }, 32 | # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training 33 | "zero_optimization": { 34 | "stage": 0, 35 | "allgather_partitions": True, 36 | "allgather_bucket_size": 500000000, 37 | "overlap_comm": True, 38 | "reduce_scatter": True, 39 | "reduce_bucket_size": 500000000, 40 | "contiguous_gradients": True, 41 | }, 42 | 43 | # batch / data settings 44 | "train_micro_batch_size_per_gpu": 4, 45 | "data_impl": "mmap", 46 | "split": "949,50,1", 47 | 48 | # activation checkpointing 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | # regularization 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.0, 57 | "hidden_dropout": 0.0, 58 | "attention_dropout": 0.0, 59 | 60 | "precision": "bfloat16", 61 | 62 | "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 63 | # misc. 
training settings 64 | "train_iters": 320000, 65 | "lr_decay_iters": 320000, 66 | "distributed_backend": "nccl", 67 | "lr_decay_style": "cosine", 68 | "warmup": 0.01, 69 | "checkpoint_factor": 10000, 70 | "eval_interval": 1000, 71 | "eval_iters": 10, 72 | 73 | # logging 74 | "log_interval": 100, 75 | "steps_per_print": 10, 76 | "keep_last_n_checkpoints": 4, 77 | "wall_clock_breakdown": true, 78 | } 79 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/cpu_mock_config.yml: -------------------------------------------------------------------------------- 1 | # CPU unit tests should be independent of the presence of GPUs on the test server 2 | # host. This configuration mocks these GPU resources and other dependencies. 3 | { 4 | "global_num_gpus": 1 5 | } 6 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/eleutherai_cluster.yml: -------------------------------------------------------------------------------- 1 | # Data paths and options when using EleutherAI cluster 2 | { 3 | # you may include multiple distinct datasets if desired 4 | "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], 5 | "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], 6 | "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], 7 | 8 | # if using multiple datasets, provide weights for them to be sampled with 9 | # "train-data-weights": [1., 2.], 10 | # "test-data-weights": [2., 1.], 11 | # "valid-data-weights": [0.5, 0.4], 12 | 13 | 14 | # If you would like the code to create val and test datasets from your training set, use the following instead 15 | # "split" determines the relative size of train, val, and test 16 | 17 | # "split": "995,4,1" 18 | # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", 19 | 20 | "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", 21 | "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", 22 | "save": "/mnt/ssd-1/checkpoints", 23 | "load": "/mnt/ssd-1/checkpoints", 24 | "tensorboard_dir": "/mnt/ssd-1/tensorboard", 25 | "log_dir": "/mnt/ssd-1/logs", 26 | "wandb_team": "eleutherai", 27 | "wandb_project": "neox", 28 | "wandb_group": "example" 29 | } 30 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/gmlp_small.yml: -------------------------------------------------------------------------------- 1 | # GPT-2 pretraining setup 2 | { 3 | # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages 4 | # across the node boundaries ) 5 | "pipe_parallel_size": 1, 6 | "model_parallel_size": 1, 7 | "attention_config": [[["gmlp"], "all"]], 8 | 9 | 10 | # model settings 11 | "num_layers": 12, 12 | "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 13 | "gmlp_attn_dim": 64, 14 | "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 
15 | "seq_length": 2048, 16 | "max_position_embeddings": 2048, 17 | "norm": "layernorm", 18 | "pos_emb": "none", 19 | "no_weight_tying": true, 20 | 21 | # optimizer settings 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0006, 26 | "betas": [0.9, 0.999], 27 | "eps": 1.0e-8, 28 | } 29 | }, 30 | 31 | # batch / data settings 32 | "train_micro_batch_size_per_gpu": 4, 33 | "data_impl": "mmap", 34 | "split": "949,50,1", 35 | 36 | # activation checkpointing 37 | "checkpoint_activations": true, 38 | "checkpoint_num_layers": 1, 39 | "partition_activations": false, 40 | "synchronize_each_layer": true, 41 | 42 | # regularization 43 | "gradient_clipping": 1.0, 44 | "weight_decay": 0.1, 45 | "hidden_dropout": 0.0, 46 | "attention_dropout": 0.0, 47 | 48 | # precision settings 49 | "fp16": { 50 | "enabled": true, 51 | "loss_scale": 0, 52 | "loss_scale_window": 1000, 53 | "hysteresis": 2, 54 | "min_loss_scale": 1 55 | }, 56 | 57 | # misc. training settings 58 | "train_iters": 320000, 59 | "lr_decay_iters": 320000, 60 | "distributed_backend": "nccl", 61 | "lr_decay_style": "cosine", 62 | "warmup": 0.01, 63 | "checkpoint_factor": 10000, 64 | "eval_interval": 1000, 65 | "eval_iters": 10, 66 | 67 | # logging 68 | "log_interval": 100, 69 | "steps_per_print": 10, 70 | "keep_last_n_checkpoints": 4, 71 | "wall_clock_breakdown": true, 72 | } 73 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/local_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | #"data_path": "/var/nfs/data/enwik8/enwik8_text_document", 4 | "data_path": "/gpt-neox/data/enwik8/enwik8_text_document", 5 | 6 | # or for weighted datasets: 7 | # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 8 | # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 9 | # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], 10 | # "train-data-weights": [1., 2.], 11 | # "test-data-weights": [2., 1.], 12 | # "valid-data-weights": [0.5, 0.4], 13 | 14 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
15 | # WARNING: setting this to True will override any user provided weights 16 | # "weight_by_num_documents": false, 17 | # "weighted_sampler_alpha": 0.3, 18 | 19 | "vocab_file": "/gpt-neox/data/gpt2-vocab.json", 20 | "merge_file": "/gpt-neox/data/gpt2-merges.txt", 21 | 22 | "save": "checkpoints", 23 | "load": "checkpoints", 24 | "checkpoint_validation_with_forward_pass": False, 25 | 26 | "tensorboard_dir": "tensorboard", 27 | "log_dir": "./logs", 28 | "use_wandb": True, 29 | "wandb_host": "https://api.wandb.ai", 30 | "wandb_project": "neox" 31 | } 32 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/slurm_125M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | "num_layers": 12, 5 | "hidden_size": 768, 6 | "num_attention_heads": 12, 7 | "seq_length": 2048, 8 | "max_position_embeddings": 2048, 9 | "norm": "layernorm", 10 | "pos_emb": "rotary", 11 | "no_weight_tying": true, 12 | "scaled_upper_triang_masked_softmax_fusion": true, 13 | "bias_gelu_fusion": true, 14 | "optimizer": { 15 | "type": "Adam", 16 | "params": { 17 | "lr": 0.0006, 18 | "betas": [0.9, 0.999], 19 | "eps": 1.0e-8 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 0, 24 | "allgather_partitions": true, 25 | "allgather_bucket_size": 500000000, 26 | "overlap_comm": true, 27 | "reduce_scatter": true, 28 | "reduce_bucket_size": 500000000, 29 | "contiguous_gradients": true 30 | }, 31 | "train_micro_batch_size_per_gpu": 4, 32 | "data_impl": "mmap", 33 | "split": "949,50,1", 34 | "checkpoint_activations": true, 35 | "checkpoint_num_layers": 1, 36 | "partition_activations": true, 37 | "synchronize_each_layer": true, 38 | "gradient_clipping": 1.0, 39 | "weight_decay": 0.0, 40 | "hidden_dropout": 0.0, 41 | "attention_dropout": 0.0, 42 | "fp16": { 43 | "enabled": true, 44 | "loss_scale": 0, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "train_iters": 320000, 50 | "lr_decay_iters": 320000, 51 | "distributed_backend": "nccl", 52 | "lr_decay_style": "cosine", 53 | "warmup": 0.01, 54 | "checkpoint_factor": 10000, 55 | "eval_interval": 1000, 56 | "eval_iters": 10, 57 | "log_interval": 100, 58 | "steps_per_print": 10, 59 | "keep_last_n_checkpoints": 4, 60 | "wall_clock_breakdown": true, 61 | "launcher": "slurm", 62 | "deepspeed_slurm": true, 63 | "comment": "neox" 64 | } 65 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/slurm_local.yml: -------------------------------------------------------------------------------- 1 | { 2 | "data_path": "data/enwik8/enwik8_text_document", 3 | "vocab_file": "data/gpt2-vocab.json", 4 | "merge_file": "data/gpt2-merges.txt", 5 | "save": "checkpoints", 6 | "checkpoint_validation_with_forward_pass": false, 7 | "tensorboard_dir": "tensorboard", 8 | "log_dir": "logs", 9 | "use_wandb": true, 10 | "wandb_host": "https://api.wandb.ai", 11 | "wandb_project": "neox" 12 | } 13 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/sparse.yml: -------------------------------------------------------------------------------- 1 | # Add this to your config for sparse attention every other layer 2 | { 3 | "attention_config": [[["local", "global"], "all"]], 4 | 5 | # sparsity config: 6 | # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for 7 | # 
illustrative purposes) 8 | # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for 9 | # more detailed config instructions and available parameters 10 | 11 | "sparsity_config": { 12 | "block": 16, # block size 13 | "num_local_blocks": 32, 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /gpt-neox/configs/org/text_generation.yml: -------------------------------------------------------------------------------- 1 | # Parameters used for text generation 2 | # Make sure `load` is specified somewhere else 3 | { 4 | # Text gen type: `input-file`, `unconditional` or `interactive` 5 | "text_gen_type": "unconditional", 6 | 7 | # Params for all 8 | "maximum_tokens": 102, 9 | "prompt_end": "\n", 10 | "temperature": 1.0, 11 | "top_p": 0.0, 12 | "top_k": 0, 13 | "recompute": false, 14 | 15 | # `unconditional`: samples 16 | "num_samples": 10, 17 | 18 | # input/output file 19 | "sample_input_file": "sample_input.txt", 20 | "sample_output_file": "sample_output.txt", 21 | } 22 | -------------------------------------------------------------------------------- /gpt-neox/configs/pile.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | "data_path": "/data/pile/pile_text_document", 4 | 5 | # If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 6 | # WARNING: setting this to True will override any user provided weights 7 | # "weight_by_num_documents": false, 8 | # "weighted_sampler_alpha": 0.3, 9 | 10 | "tokenizer_type": "HFTokenizer", 11 | "vocab_file": "/data/pile/20B_tokenizer.json", 12 | 13 | # "save": "checkpoints", 14 | # "load": "checkpoints", 15 | "checkpoint_validation_with_forward_pass": False, 16 | 17 | "tensorboard_dir": "tensorboard", 18 | "log_dir": "./logs", 19 | "use_wandb": True, 20 | "wandb_host": "https://api.wandb.ai", 21 | "wandb_project": "neox" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/1-4B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0002, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00002, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 16, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 
53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/12B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 4, 4 | 5 | "num_layers": 36, 6 | "hidden_size": 5120, 7 | "num_attention_heads": 40, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 36]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00012, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000012, 31 | 32 | "zero_optimization": { 33 | "stage": 1, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 1260000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 1260000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "train_micro_batch_size_per_gpu": 8, 44 | "gradient_accumulation_steps": 2, 45 | "data_impl": "mmap", 46 | 47 | "checkpoint_activations": true, 48 | "checkpoint_num_layers": 1, 49 | "partition_activations": true, 50 | "synchronize_each_layer": true, 51 | 52 | "gradient_clipping": 1.0, 53 | "weight_decay": 0.1, 54 | "hidden_dropout": 0, 55 | "attention_dropout": 0, 56 | 57 | "fp16": { 58 | "fp16": true, 59 | "enabled": true, 60 | "loss_scale": 0, 61 | "loss_scale_window": 1000, 62 | "initial_scale_power": 12, 63 | "hysteresis": 2, 64 | "min_loss_scale": 1 65 | }, 66 | 67 | "train_iters": 143000, 68 | "lr_decay_iters": 143000, 69 | "distributed_backend": "nccl", 70 | "lr_decay_style": "cosine", 71 | "warmup": 0.01, 72 | "checkpoint_factor": 1000, 73 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 74 | "eval_interval": 143000, 75 | "eval_iters": 10, 76 | 77 | "log_interval": 10, 78 | "steps_per_print": 10, 79 | "wall_clock_breakdown": true, 80 | 81 | "log_grad_norm": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/160M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 12, 6 | "hidden_size": 768, 7 | "num_attention_heads": 12, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": 
true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 12]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0006, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00006, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/1B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 16, 6 | "hidden_size": 2048, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "scaled_upper_triang_masked_softmax_fusion": true, 17 | "bias_gelu_fusion": true, 18 | 19 | "init_method": "small_init", 20 | "output_layer_init_method": "wang_init", 21 | 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00025, 26 | "betas": [0.9, 0.95], 27 | "eps": 1.0e-8 28 | } 29 | }, 30 | "min_lr": 0.000025, 31 | 32 | "zero_optimization": { 33 | "stage": 0, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 500000000, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": 500000000, 39 | "contiguous_gradients": true, 40 | "cpu_offload": false 41 | }, 42 | 43 | "fp16": { 44 | "enabled": true, 45 | "type": "bfloat16", 46 | "auto_cast": true, 47 | "loss_scale": 0, 48 | "loss_scale_window": 1000, 49 | "initial_scale_power": 12, 50 | "hysteresis": 2, 51 | "min_loss_scale": 1 52 | }, 53 | 54 | "fp32_allreduce": true, 55 | 56 | "train_micro_batch_size_per_gpu": 4, 57 | "gradient_accumulation_steps": 4, 58 | "data_impl": "mmap", 59 | "num_workers": 1, 60 | 61 | "checkpoint_activations": true, 62 | "checkpoint_num_layers": 1, 63 | "partition_activations": true, 64 | "synchronize_each_layer": true, 65 | 66 | "gradient_clipping": 1.0, 67 | "weight_decay": 
0.1, 68 | "hidden_dropout": 0, 69 | "attention_dropout": 0, 70 | 71 | "train_iters": 143000, 72 | "lr_decay_iters": 143000, 73 | "distributed_backend": "nccl", 74 | "lr_decay_style": "cosine", 75 | "warmup": 0.01, 76 | "checkpoint_factor": 1000, 77 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 78 | "eval_interval": 143000, 79 | "eval_iters": 10, 80 | 81 | "log_interval": 10, 82 | "steps_per_print": 10, 83 | "wall_clock_breakdown": true, 84 | 85 | "tokenizer_type": "HFTokenizer" 86 | } 87 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/2-8B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 2560, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 32]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.00016, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.000016, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 40000, 78 | "eval_iters": 10, 79 | 80 | "log_grad_norm": true, 81 | 82 | "log_interval": 10, 83 | "steps_per_print": 10, 84 | "wall_clock_breakdown": true, 85 | 86 | "tokenizer_type": "HFTokenizer" 87 | } 88 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/410M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 24, 6 | "hidden_size": 1024, 7 | "num_attention_heads": 16, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 24]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": 
"small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.0003, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.00003, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 143000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/6-9B.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 2, 4 | 5 | "num_layers": 32, 6 | "hidden_size": 4096, 7 | "num_attention_heads": 32, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "norm": "layernorm", 11 | "pos_emb": "rotary", 12 | "rotary_pct": 0.25, 13 | "no_weight_tying": true, 14 | "gpt_j_residual": true, 15 | "output_layer_parallelism": "column", 16 | 17 | "attention_config": [[["flash"], 32]], 18 | 19 | "scaled_upper_triang_masked_softmax_fusion": true, 20 | "bias_gelu_fusion": true, 21 | 22 | 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 0.00012, 27 | "betas": [0.9, 0.95], 28 | "eps": 1.0e-8 29 | } 30 | }, 31 | 32 | "min_lr": 0.000012, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 1260000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 1260000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 8, 46 | "gradient_accumulation_steps": 2, 47 | "data_impl": "mmap", 48 | 49 | "checkpoint_activations": true, 50 | "checkpoint_num_layers": 1, 51 | "partition_activations": true, 52 | "synchronize_each_layer": true, 53 | 54 | "gradient_clipping": 1.0, 55 | "weight_decay": 0.1, 56 | "hidden_dropout": 0, 57 | "attention_dropout": 0, 58 | 59 | "fp16": { 60 | "fp16": true, 61 | "enabled": true, 62 | "loss_scale": 0, 63 | "loss_scale_window": 1000, 64 | "initial_scale_power": 12, 65 | "hysteresis": 2, 66 | "min_loss_scale": 1 67 | }, 68 | 69 | "train_iters": 143000, 70 | "lr_decay_iters": 143000, 71 | "distributed_backend": "nccl", 72 | "lr_decay_style": "cosine", 73 | "warmup": 0.01, 74 | "checkpoint_factor": 1000, 75 | "extra_save_iters": 
[0,1,2,4,8,16,32,64,128,256,512], 76 | "eval_interval": 143000, 77 | "eval_iters": 10, 78 | 79 | "log_interval": 10, 80 | "steps_per_print": 10, 81 | "wall_clock_breakdown": true, 82 | 83 | "tokenizer_type": "HFTokenizer" 84 | } 85 | -------------------------------------------------------------------------------- /gpt-neox/configs/pythia/70M.yml: -------------------------------------------------------------------------------- 1 | { 2 | "pipe_parallel_size": 1, 3 | "model_parallel_size": 1, 4 | 5 | "num_layers": 6, 6 | "hidden_size": 512, 7 | "num_attention_heads": 8, 8 | "seq_length": 2048, 9 | "max_position_embeddings": 2048, 10 | "pos_emb": "rotary", 11 | "rotary_pct": 0.25, 12 | "no_weight_tying": true, 13 | "gpt_j_residual": true, 14 | "output_layer_parallelism": "column", 15 | 16 | "attention_config": [[["flash"], 6]], 17 | 18 | "scaled_upper_triang_masked_softmax_fusion": true, 19 | "bias_gelu_fusion": true, 20 | 21 | "init_method": "small_init", 22 | "output_layer_init_method": "wang_init", 23 | 24 | "optimizer": { 25 | "type": "Adam", 26 | "params": { 27 | "lr": 0.001, 28 | "betas": [0.9, 0.95], 29 | "eps": 1.0e-8 30 | } 31 | }, 32 | "min_lr": 0.0001, 33 | 34 | "zero_optimization": { 35 | "stage": 1, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 500000000, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": 500000000, 41 | "contiguous_gradients": true, 42 | "cpu_offload": false 43 | }, 44 | 45 | "train_micro_batch_size_per_gpu": 32, 46 | "gas": 1, 47 | "data_impl": "mmap", 48 | "num_workers": 1, 49 | 50 | "checkpoint_activations": true, 51 | "checkpoint_num_layers": 1, 52 | "partition_activations": true, 53 | "synchronize_each_layer": true, 54 | 55 | "gradient_clipping": 1.0, 56 | "weight_decay": 0.1, 57 | "hidden_dropout": 0, 58 | "attention_dropout": 0, 59 | 60 | "fp16": { 61 | "fp16": true, 62 | "enabled": true, 63 | "loss_scale": 0, 64 | "loss_scale_window": 1000, 65 | "initial_scale_power": 12, 66 | "hysteresis": 2, 67 | "min_loss_scale": 1 68 | }, 69 | 70 | "train_iters": 143000, 71 | "lr_decay_iters": 143000, 72 | "distributed_backend": "nccl", 73 | "lr_decay_style": "cosine", 74 | "warmup": 0.01, 75 | "checkpoint_factor": 1000, 76 | "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], 77 | "eval_interval": 100000, 78 | "eval_iters": 10, 79 | 80 | "log_interval": 10, 81 | "steps_per_print": 10, 82 | "wall_clock_breakdown": true, 83 | 84 | "tokenizer_type": "HFTokenizer" 85 | } 86 | -------------------------------------------------------------------------------- /gpt-neox/configs/slurm_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab-file": "data/gpt2-vocab.json", 3 | "merge-file": "data/gpt2-merges.txt", 4 | "save": "checkpoints", 5 | "checkpoint_validation_with_forward_pass": false, 6 | "tensorboard-dir": "tensorboard", 7 | "log-dir": "logs", 8 | "use_wandb": true, 9 | "wandb_host": "https://api.wandb.ai", 10 | "wandb_project": "neox" 11 | } 12 | -------------------------------------------------------------------------------- /gpt-neox/deepy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2021, EleutherAI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | import os 18 | 19 | import deepspeed.launcher.runner 20 | 21 | 22 | def main(): 23 | logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) 24 | 25 | from megatron.neox_arguments import NeoXArgs 26 | from megatron.utils import get_wandb_api_key 27 | 28 | neox_args = NeoXArgs.consume_deepy_args() 29 | deepspeed_main_args = neox_args.get_deepspeed_main_args() 30 | 31 | # Extract wandb API key and inject into worker environments 32 | wandb_token = get_wandb_api_key(neox_args=neox_args) 33 | if wandb_token is not None: 34 | deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") 35 | os.environ["WANDB_API_KEY"] = wandb_token 36 | 37 | deepspeed.launcher.runner.main(deepspeed_main_args) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /gpt-neox/eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .eval_adapter import EvalHarnessAdapter, run_eval_harness 16 | -------------------------------------------------------------------------------- /gpt-neox/hostfile: -------------------------------------------------------------------------------- 1 | node01 slots=1 2 | node02 slots=1 3 | -------------------------------------------------------------------------------- /gpt-neox/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | 16 | 17 | def print_rank_0(*message): 18 | """If distributed is initialized print only on rank 0.""" 19 | if torch.distributed.is_initialized(): 20 | if torch.distributed.get_rank() == 0: 21 | print(*message, flush=True) 22 | else: 23 | print(*message, flush=True) 24 | 25 | 26 | from .initialize import initialize_megatron 27 | from .neox_arguments import NeoXArgs 28 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /gpt-neox/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/gpt-neox/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | import subprocess 18 | 19 | from torch.utils import cpp_extension 20 | from pathlib import Path 21 | 22 | srcpath = Path(__file__).parent.absolute() 23 | 24 | # Setting this param to a list has a problem of generating different 25 | # compilation commands (with different order of architectures) and 26 | # leading to recompilation of fused kernels. Set it to empty string 27 | # to avoid recompilation and assign arch flags explicitly in 28 | # extra_cuda_cflags below 29 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 30 | 31 | 32 | def load_fused_kernels(): 33 | try: 34 | import scaled_upper_triang_masked_softmax_cuda 35 | import scaled_masked_softmax_cuda 36 | except (ImportError, ModuleNotFoundError) as e: 37 | print("\n") 38 | print(e) 39 | print("=" * 100) 40 | print( 41 | f"ERROR: Fused kernels configured but not properly installed. 
Please run `pip install {str(srcpath)}` to install them" 42 | ) 43 | print("=" * 100) 44 | exit() 45 | return 46 | -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /gpt-neox/megatron/fused_kernels/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def _get_cuda_bare_metal_version(cuda_dir): 9 | raw_output = subprocess.check_output( 10 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 11 | ) 12 | output = raw_output.split() 13 | release_idx = output.index("release") + 1 14 | release = output[release_idx].split(".") 15 | bare_metal_major = release[0] 16 | bare_metal_minor = release[1][0] 17 | 18 | return raw_output, bare_metal_major, bare_metal_minor 19 | 20 | 21 | srcpath = Path(__file__).parent.absolute() 22 | cc_flag = [] 23 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append("-gencode") 26 | cc_flag.append("arch=compute_80,code=sm_80") 27 | 28 | nvcc_flags = [ 29 | "-O3", 30 | "-gencode", 31 | "arch=compute_70,code=sm_70", 32 | "--use_fast_math", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "--expt-relaxed-constexpr", 36 | "--expt-extended-lambda", 37 | ] 38 | cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} 39 | layernorm_cuda_args = { 40 | "cxx": ["-O3"], 41 | "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], 42 | } 43 | setup( 44 | name="fused_kernels", 45 | version="0.0.1", 46 | author="Sid Black & Alejandro Molina et al.", 47 | author_email="alejandro.molina@aleph-alpha.de", 48 | include_package_data=False, 49 | ext_modules=[ 50 | CUDAExtension( 51 | "scaled_upper_triang_masked_softmax_cuda", 52 | [ 53 | str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), 54 | str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), 55 | ], 56 | extra_compile_args=cuda_ext_args, 57 | ), 58 | CUDAExtension( 59 | "scaled_masked_softmax_cuda", 60 | [ 61 | str(srcpath / "scaled_masked_softmax.cpp"), 62 | str(srcpath / "scaled_masked_softmax_cuda.cu"), 63 | ], 64 | 
extra_compile_args=cuda_ext_args, 65 | ), 66 | ], 67 | cmdclass={"build_ext": BuildExtension}, 68 | ) 69 | -------------------------------------------------------------------------------- /gpt-neox/megatron/gradient_noise_scale/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_noise_scale import GradientNoiseScale 2 | -------------------------------------------------------------------------------- /gpt-neox/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from .gpt2_model import GPT2ModelPipe 19 | from .utils import get_params_for_weight_decay_optimization 20 | from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /gpt-neox/megatron/model/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI contributors 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from typing import Optional 19 | from torch import Tensor 20 | 21 | # flags required to enable jit fusion kernels 22 | torch._C._jit_set_profiling_mode(False) 23 | torch._C._jit_set_profiling_executor(False) 24 | torch._C._jit_override_can_fuse_on_cpu(True) 25 | torch._C._jit_override_can_fuse_on_gpu(True) 26 | 27 | 28 | def bias_dropout_add( 29 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool 30 | ) -> Tensor: 31 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 32 | if residual is not None: 33 | out = residual + out 34 | return out 35 | 36 | 37 | def get_bias_dropout_add(training): 38 | def _bias_dropout_add(x, bias, residual, prob): 39 | return bias_dropout_add(x, bias, residual, prob, training) 40 | 41 | return _bias_dropout_add 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_fused_train( 46 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add(x, bias, residual, prob, True) 49 | 50 | 51 | @torch.jit.script 52 | def bias_dropout_add_fused_inference( 53 | x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float 54 | ) -> Tensor: 55 | return bias_dropout_add(x, bias, residual, prob, False) 56 | -------------------------------------------------------------------------------- /gpt-neox/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Model parallel utility interface.""" 16 | 17 | from .cross_entropy import vocab_parallel_cross_entropy 18 | 19 | from .data import broadcast_data 20 | 21 | from .initialize import is_unitialized 22 | from .initialize import destroy_model_parallel 23 | from .initialize import get_data_parallel_group 24 | from .initialize import get_data_parallel_rank 25 | from .initialize import get_data_parallel_world_size 26 | from .initialize import get_model_parallel_group 27 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 28 | from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank 29 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 30 | from .initialize import get_topology 31 | from .initialize import get_pipe_parallel_group 32 | from .initialize import get_pipe_parallel_rank 33 | from .initialize import get_pipe_parallel_world_size 34 | from .initialize import get_io_parallel_group 35 | from .initialize import initialize_model_parallel 36 | from .initialize import model_parallel_is_initialized 37 | 38 | from .layers import ColumnParallelLinear 39 | from .layers import RowParallelLinear 40 | from .layers import VocabParallelEmbedding 41 | from .layers import ParallelRelativePositionBias 42 | 43 | from .mappings import copy_to_model_parallel_region 44 | from .mappings import gather_from_model_parallel_region 45 | from .mappings import reduce_from_model_parallel_region 46 | from .mappings import scatter_to_model_parallel_region 47 | 48 | from .random import checkpoint 49 | from .random import get_cuda_rng_tracker 50 | from .random import model_parallel_cuda_manual_seed 51 | 52 | from .utils import divide 53 | from .utils import split_tensor_along_last_dim 54 | -------------------------------------------------------------------------------- /gpt-neox/megatron/mpu/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Mostly moving to using checkpointing from deepspeed (identical code anyway), so currently this file is only imports. 16 | # TODO: should be able to get rid of this file entirely 17 | 18 | import deepspeed 19 | import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing 20 | 21 | # Default name for the model parallel rng tracker. 22 | _MODEL_PARALLEL_RNG_TRACKER_NAME = ( 23 | deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME 24 | ) 25 | 26 | # Whether to apply model parallelism to checkpointed hidden states. 27 | _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None 28 | 29 | # RNG tracker object.
30 | _CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER 31 | 32 | # Deepspeed checkpointing functions 33 | # TODO: replace calls to these in our codebase with calls to the deepspeed ones 34 | _set_cuda_rng_state = checkpointing._set_cuda_rng_state 35 | checkpoint = checkpointing.checkpoint 36 | model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed 37 | get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker 38 | -------------------------------------------------------------------------------- /gpt-neox/megatron/neox_arguments/template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | import logging 17 | 18 | 19 | @dataclass 20 | class NeoXArgsTemplate: 21 | def defaults(self): 22 | """ 23 | Generator for getting default values. 24 | """ 25 | for key, field_def in self.__dataclass_fields__.items(): 26 | yield key, field_def.default 27 | 28 | def update_value(self, key: str, value): 29 | """ 30 | Updates a property value if the key already exists. 31 | 32 | Problem: a previously non-existing property can be added to the class instance without error. 33 | """ 34 | if hasattr(self, key): 35 | setattr(self, key, value) 36 | else: 37 | error_message = ( 38 | self.__class__.__name__ 39 | + ".update_value(): property to update " 40 | + str(key) 41 | + " does not exist" 42 | ) 43 | logging.error(error_message) 44 | raise ValueError(error_message) 45 | 46 | def update_values(self, d): 47 | """ 48 | Updates multiple values in self if the keys already exist. 49 | """ 50 | for k, v in d.items(): 51 | self.update_value(k, v) 52 | -------------------------------------------------------------------------------- /gpt-neox/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8>=1.5.6 2 | clang-format>=13.0.1 3 | pre-commit>=2.17.0 4 | pytest>=6.2.3 5 | pytest-cov>=2.11.1 6 | pytest-forked>=1.3.0 7 | pytest-xdist 8 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-flashattention.txt: -------------------------------------------------------------------------------- 1 | flash-attn==2.2.1 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-onebitadam.txt: -------------------------------------------------------------------------------- 1 | cupy-cuda111>=8.6.0 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-s3.txt: -------------------------------------------------------------------------------- 1 | hf-transfer>=0.1.3 2 | boto3 -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-sparseattention.txt: -------------------------------------------------------------------------------- 1 | triton==2.0.0.dev20221202 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-tensorboard.txt: -------------------------------------------------------------------------------- 1 | tensorboard==2.13.0 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements-wandb.txt: -------------------------------------------------------------------------------- 1 | wandb>=0.10.28 2 | -------------------------------------------------------------------------------- /gpt-neox/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | best_download 2 | git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed 3 | ftfy>=6.0.1 4 | git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 5 | huggingface_hub>=0.11.0 6 | lm_eval==0.3.0 7 | mpi4py>=3.0.3 8 | numpy>=1.22.0 9 | pybind11>=2.6.2 10 | regex 11 | sentencepiece 12 | six 13 | tiktoken>=0.1.2 14 | tokenizers>=0.12.1 15 | transformers==4.30.2 16 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/0.remove_nvidia_driver_and_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get purge nvidia* 4 | sudo apt-get autoremove 5 | sudo apt-get autoclean 6 | sudo rm -rf /usr/local/cuda* 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/1.cuda_11_7_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt update 4 | sudo apt install wget axel 5 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin 6 | sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | axel -a -n 20 https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-ubuntu2004-11-7-local_11.7.0-515.43.04-1_amd64.deb 8 | sudo dpkg -i cuda-repo-ubuntu2004-11-7-local_11.7.0-515.43.04-1_amd64.deb 
9 | sudo cp /var/cuda-repo-ubuntu2004-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/ 10 | sudo apt-get update 11 | sudo apt-get -y install nvidia-driver-515 12 | echo "blacklist nouveau" | sudo tee /etc/modprobe.d/blacklist-nouveau.conf 13 | echo "options nouveau modeset=0" | sudo tee -a /etc/modprobe.d/blacklist-nouveau.conf 14 | sudo update-initramfs -u 15 | sudo apt-get -y install cuda-11-7 16 | 17 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/11.cat_csv_from_log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ]; then 4 | echo "Usage: $0 [log_file]" 5 | exit 0 6 | fi 7 | 8 | if [ ! -f $1 ]; then 9 | echo "$1 is missing." 10 | exit 0 11 | fi 12 | 13 | #grep -a lm_loss $1 | awk '{printf "%s,%s,%s,%f\n",substr($8, 1, length($8)-1),$2,substr($26,1,length($26)-6),$29}' | tee $1.csv 14 | grep -a lm_loss $1 | awk '{printf "%s,%f\n",$2,$5}' | tee $1.csv 15 | grep -a lm_loss $1 | awk '{printf "%s,%s,%f\n",substr($8,1,length($8)-1),substr($26,1,length($26)-6),$35}' | tee $1.csv 16 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/12.run_and_collect_logs_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #CONFIGS=(125M.yml 6-7B.yml) 4 | CONFIG=6-7B.yml 5 | CONT_NAME="gpt-neox-container" 6 | #BATCHS=(1 2 4 8 16 32 64 128) 7 | BATCHS=(1 2 4 8 16 32) 8 | #GPUS=(1 2 3 4 5 6 7 8) 9 | #gpus per node 10 | GPUS=4 11 | NODES=(s1 s8) 12 | #Pipeline Parallel 13 | PP=(1 2 4 8) 14 | #PP=(8) 15 | TRAIN_ITERS=500 16 | TARGET_LM_LOSS=0 17 | TRAIN_TIME=600 18 | HOSTFILE=./scripts_swsok/hostfile 19 | 20 | if [ ! -z "$1" ]; then 21 | GPUS=$1 22 | fi 23 | 24 | rm logs/* 25 | rm checkpoints/* -rf 26 | mkdir swsok-results/ 27 | mkdir checkpoints/ 28 | 29 | rm $HOSTFILE 30 | for i in ${NODES[@]}; do 31 | echo "$i slots=$GPUS" >> $HOSTFILE 32 | done 33 | 34 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$CONFIG 35 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$CONFIG 36 | sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$CONFIG 37 | sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$CONFIG 38 | 39 | for i in ${NODES[@]}; do 40 | ssh $i docker stop $CONT_NAME 41 | ssh $i docker run -d -it --name $CONT_NAME --rm --network host --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=/var/nfs,dst=/var/nfs swsok/gpt-neox:v7 42 | done 43 | 44 | for p in ${PP[@]}; do 45 | for b in ${BATCHS[@]}; do 46 | echo "$CONFIG Nodes ${#NODES[@]} GPUS $GPUS BATCH $b Pipeline $p" > logs/current_test_setting.txt 47 | 48 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$CONFIG 49 | sed -i "/\"pipe_parallel_size\"/c\ \"pipe_parallel_size\": \\$p," configs/$CONFIG 50 | 51 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$CONFIG configs/local_setup.yml configs/etri_cluster.yml 52 | 53 | mv logs/*stdout.txt swsok-results/conf-$CONFIG-gpus-$GPUS-pp-$p-microbatch-$b-$(date '+%Y-%m-%d').txt 54 | rm logs/* 55 | rm checkpoints/* -rf 56 | 57 | sleep 1 58 | done 59 | done 60 | 61 | for i in ${NODES[@]}; do 62 | ssh $i docker stop $CONT_NAME 63 | done 64 | 65 | 66 | 
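The benchmark sweep scripts in scripts_swsok (12 above and 15-17 below) all mutate the training configuration the same way: a `sed -i "/\"key\"/c\ ..."` one-liner that replaces the whole `"key": value` line in the YAML file before each run. As a minimal sketch of that pattern factored into a reusable helper; the `set_config_key` name is hypothetical and not part of the repo:

```bash
#!/bin/bash
# Hypothetical helper (illustrative only): rewrite a single "key": value line
# in a gpt-neox config, mirroring the exact sed quoting the sweep scripts use.
# Usage: set_config_key <config file> <key> <value>
set_config_key() {
    local file=$1 key=$2 value=$3
    # sed's 'c\' command replaces every line matching the key with a fresh pair.
    sed -i "/\"$key\"/c\ \"$key\": \\$value," "$file"
}

# Example: the inner loop of 12.run_and_collect_logs_multi.sh in these terms.
set_config_key configs/6-7B.yml train_micro_batch_size_per_gpu 8
set_config_key configs/6-7B.yml pipe_parallel_size 4
```

Note the caveat this inherits from the scripts themselves: the match is a plain substring search, so a short key such as "stage" will also rewrite any other config line that happens to contain that quoted key.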
-------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/15.long_seqlen_1.3B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(1 2) 6 | GPUS=8 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=20 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(32 64) 14 | 15 | docker stop $CONT_NAME 16 | sudo rm logs/* -rf 17 | rm checkpoints/* -rf 18 | 19 | i=$GPUS 20 | conf=$CONFIG 21 | 22 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 23 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | 25 | 26 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 27 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 28 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 29 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 30 | 31 | for b in ${BATCHS[@]}; do 32 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 33 | 34 | for s in ${SEQLEN[@]}; do 35 | echo "$conf GPU $i microbatch $b seqlen $s" > logs/current_test_setting.txt 36 | 37 | sed -i "/\"seq_length\"/c\ \"seq_length\": \\$s," configs/$conf 38 | sed -i "/\"max_position_embeddings\"/c\ \"max_position_embeddings\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-zero-3-microbatch-$b-seqlen-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm -rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/16.zero_opt_stages_1.3B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(2) 6 | GPUS=8 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=10 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(8 16 32 64) 14 | ZERO_STAGE=(2 3) 15 | 16 | docker stop $CONT_NAME 17 | sudo rm logs/* -rf 18 | rm checkpoints/* -rf 19 | 20 | i=$GPUS 21 | conf=$CONFIG 22 | 23 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox
--security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 25 | 26 | 27 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 28 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 29 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 30 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 31 | 32 | for b in ${BATCHS[@]}; do 33 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 34 | 35 | for s in ${ZERO_STAGE[@]}; do 36 | echo "$conf GPU $i microbatch $b zero-stage $s" > logs/current_test_setting.txt 37 | 38 | sed -i "/\"stage\"/c\ \"stage\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-microbatch-$b-seq-32k-stage-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm -rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/17.760M_zero_stages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONFIG=760M-32k-len-conf.yml 4 | CONT_NAME="gpt-neox-container" 5 | BATCHS=(1 2 4 8) 6 | GPUS=1 7 | #SEQLEN=(2048 4096 8192 16384 32768) 8 | SEQLEN=(32768) 9 | #SEQLEN=(32768) 10 | TRAIN_ITERS=10 11 | TARGET_LM_LOSS=0 12 | TRAIN_TIME=1000 13 | GRADACCSTEP=(8 16 32 64) 14 | ZERO_STAGE=(0 1 2 3) 15 | 16 | docker stop $CONT_NAME 17 | sudo rm logs/* -rf 18 | rm checkpoints/* -rf 19 | 20 | i=$GPUS 21 | conf=$CONFIG 22 | 23 | #docker run -d -it --name $CONT_NAME --rm --gpus $GPUS -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 24 | docker run -d -it --name $CONT_NAME --rm --gpus $i -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 25 | 26 | 27 | sed -i "/\"train_iters\"/c\ \"train_iters\": \\$TRAIN_ITERS," configs/$conf 28 | sed -i "/\"lr_decay_iters\"/c\ \"lr_decay_iters\": \\$TRAIN_ITERS," configs/$conf 29 | #sed -i "/\"target_lm_loss\"/c\ \"target_lm_loss\": \\$TARGET_LM_LOSS," configs/$conf 30 | #sed -i "/\"target_time_in_sec\"/c\ \"target_time_in_sec\": \\$TRAIN_TIME," configs/$conf 31 | 32 | for b in ${BATCHS[@]}; do 33 | sed -i "/\"train_micro_batch_size_per_gpu\"/c\ \"train_micro_batch_size_per_gpu\": \\$b," configs/$conf 34 | 35 | for s in ${ZERO_STAGE[@]}; do 36 | echo "$conf GPU $i microbatch $b zero-stage $s" > logs/current_test_setting.txt 37 | 38 | sed -i "/\"stage\"/c\ \"stage\": \\$s," configs/$conf 39 | 40 | for g in ${GRADACCSTEP[@]}; do 41 | sed -i "/\"gradient_accumulation_steps\"/c\ \"gradient_accumulation_steps\": \\$g," configs/$conf 42 | 43 | docker exec -it -w /gpt-neox $CONT_NAME ./deepy.py train.py configs/$conf configs/enwik8.yml 44 | sudo mv -f logs/*stdout.txt swsok-results/conf-$conf-gpunum-$i-microbatch-$b-seq-32k-stage-$s-gradaccustep-$g-$(date '+%Y-%m-%d').txt 45 | sudo rm
-rf logs/* 46 | rm checkpoints/* -rf 47 | done 48 | done 49 | 50 | sleep 1 51 | done 52 | 53 | docker stop $CONT_NAME 54 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/2.docker_and_nvidia_container_toolkit_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #remove docker and reinstall 4 | for pkg in docker.io docker-doc docker-compose docker-compose-v2 podman-docker containerd runc; do 5 | sudo apt-get remove $pkg; 6 | done 7 | 8 | sudo apt-get update 9 | sudo apt-get install ca-certificates curl gnupg 10 | sudo install -m 0755 -d /etc/apt/keyrings 11 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg 12 | sudo chmod a+r /etc/apt/keyrings/docker.gpg 13 | 14 | # Add the repository to Apt sources: 15 | echo \ 16 | "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 17 | "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \ 18 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 19 | 20 | sudo apt-get update 21 | 22 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin 23 | 24 | #install nvidia-container-toolkit 25 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 26 | curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 27 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 28 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 29 | sudo apt-get update 30 | sudo apt-get install -y nvidia-container-toolkit 31 | 32 | sudo usermod -aG docker $USER 33 | sudo service docker restart 34 | #logout and login to run docker without sudo 35 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/3.required_packages_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt install libopenmpi-dev 4 | pip install mpi4py 5 | # urllib3 v2 conflicts with original gpt-neox codes 6 | #pip uninstall urllib3 7 | #pip install urllib3==1.26.16 8 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/4.requirements_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -r requirements/requirements.txt 4 | pip install -r requirements/requirements-wandb.txt 5 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/5.prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=~/.local/lib/python3.8/site-packages:/usr/lib/python3/dist-packages 4 | 5 | python prepare_data.py -d ./data 6 | 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/6.pretrain_125M_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./deepy.py train.py configs/125M.yml configs/local_setup.yml 4 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/7.patch_best_download.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sed -i 's/requests.packages.urllib3.util.retry/urllib3.util.retry/g' ~/.local/lib/python3.8/site-packages/best_download/__init__.py 4 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/8.print_loss_progress.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grep lm_loss logs/gptneox-test_stdout.txt | awk '{print $26}' 4 | 5 | #watch -n 10 "grep lm_loss gptneox-test_stdout.txt | awk '{print \$5\$6\" \"\$26}' | tail -n 10" 6 | #grep lm_loss gptneox-test_stdout.txt | awk '{print $5 $6 " " $23"/222.2TFLOPS" " " $26}' 7 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/9.run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #docker run --rm -it --network host --gpus=all -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=/var/nfs,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 4 | docker run --rm -it --network host --gpus=all -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox --mount type=bind,src=./dataset,dst=/data --security-opt seccomp=seccomp-docker.json swsok/gpt-neox:v8 5 | -------------------------------------------------------------------------------- /gpt-neox/scripts_swsok/run_sshd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo /etc/init.d/ssh start 4 | 5 | /bin/bash 6 | -------------------------------------------------------------------------------- /gpt-neox/tests/README.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | 3 | Tests use pytest with the coverage and forked plugins. Install with: 4 | 5 | ```bash 6 | pip install -r requirements/requirements-dev.txt 7 | ``` 8 | 9 | Download the required test data: 10 | ```bash 11 | python prepare_data.py 12 | ``` 13 | 14 | # Run 15 | 16 | Tests can be run using pytest. 17 | 18 | * The argument --forked needs to be provided 19 | * A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation) 20 | * A subset of tests can be selected by pointing to the module within tests 21 | 22 | ```bash 23 | # run all tests, output coverage report of megatron module in terminal 24 | pytest --forked --cov-report term --cov=megatron tests 25 | 26 | # run tests in tests/model, output coverage report of megatron module as html 27 | pytest --forked --cov-report html --cov=megatron tests/model 28 | 29 | # run tests in tests/model/test_model_generation.py, don't output coverage report 30 | pytest --forked tests/model/test_model_generation.py 31 | ``` 32 | 33 | Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. 34 | The test cases for cpu can be run with: 35 | ```bash 36 | pytest tests -m cpu 37 | ``` 38 | 39 | If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files. 40 | 41 | ```bash 42 | python -m http.server --directory htmlcov 8000 43 | ``` 44 | 45 | 46 | ## Tips and Tricks 47 | If you see this kind of error: 48 | ``` 49 | RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method 50 | ``` 51 | It means that you used some torch.cuda function before the test created the processes. 52 | -------------------------------------------------------------------------------- /gpt-neox/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/gpt-neox/tests/__init__.py -------------------------------------------------------------------------------- /gpt-neox/tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .test_model_instantiation import run_test_model_instantiation 16 | from .test_model_train import run_train_test 17 | from .test_model_checkpoint import run_checkpoint_test 18 | -------------------------------------------------------------------------------- /gpt-neox/tests/neox_args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | testing of implementation of command line arguments and configuration (NeoXArgs) 3 | """ 4 | -------------------------------------------------------------------------------- /gpt-neox/tests/neox_args/test_neoxargs_implementation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | check implementation of NeoXArgs for duplication errors (would overwrite) 17 | """ 18 | import pytest 19 | 20 | 21 | @pytest.mark.cpu 22 | def test_neoxargs_duplicates(): 23 | """ 24 | tests that there are no duplicates among parent classes of NeoXArgs 25 | """ 26 | from megatron import NeoXArgs 27 | 28 | assert NeoXArgs.validate_keys(), "test_neoxargs_duplicates" 29 | -------------------------------------------------------------------------------- /gpt-neox/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [pytest] 16 | markers = 17 | cpu: marks tests that can be run on cpu 18 | -------------------------------------------------------------------------------- /gpt-neox/tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX Auxiliary Tools 2 | 3 | This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code. 4 | 5 | ## Bash 6 | 7 | This directory contains some simple, frequently used bash commands to make working on multiple machines easier. 8 | 9 | ## Checkpoints 10 | 11 | This directory contains tools for manipulating and converting checkpoints including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x. 12 | 13 | ## Datasets 14 | 15 | This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library. 16 | -------------------------------------------------------------------------------- /gpt-neox/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/README.md: -------------------------------------------------------------------------------- 1 | # Bash Scripts 2 | Useful for running distributed per-node scripts on e.g. Kubernetes 3 | 4 | * `kill.sh` kills all python processes 5 | * `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/` 6 | * `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/` 7 | * `sync.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` 8 | * `syncdir.sh` uses pdcp to recursively copy every file or directory in a provided path to all of the nodes listed in `/job/hosts/` 9 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /gpt-neox/tools/bash/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /gpt-neox/tools/ckpts/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") 53 | -------------------------------------------------------------------------------- /gpt-neox/tools/kill.sh: -------------------------------------------------------------------------------- 1 | pkill -9 python 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/killall.sh: -------------------------------------------------------------------------------- 1 | pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' 2 | -------------------------------------------------------------------------------- /gpt-neox/tools/sync.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # sync.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | echo Uploading $full_path 27 | pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path 28 | done 29 | -------------------------------------------------------------------------------- /gpt-neox/tools/sync_cmd.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Runs a command in parallel across all nodes 18 | # Usage 19 | # sync_cmd.sh 'echo "hello world"' 20 | 21 | echo "Command: $1"; 22 | pdsh -R ssh -w ^/job/hosts $1 23 | -------------------------------------------------------------------------------- /gpt-neox/tools/syncdir.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env bash 16 | 17 | # Push files to all nodes 18 | # Usage 19 | # syncdir.sh file [file2..] 20 | 21 | echo Number of files to upload: $# 22 | 23 | for file in "$@" 24 | do 25 | full_path=$(realpath $file) 26 | parentdir="$(dirname "$full_path")" 27 | echo Uploading $full_path to $parentdir 28 | pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir 29 | done 30 | -------------------------------------------------------------------------------- /gpt-neox/tools/upload.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import os 16 | import sys 17 | 18 | from huggingface_hub import HfApi, create_repo 19 | 20 | converted_ckpt = sys.argv[1] 21 | repo_name = sys.argv[2] 22 | branch_name = sys.argv[3] 23 | try: 24 | create_repo(repo_name, repo_type="model", private=False) 25 | except: 26 | print(f"repo {repo_name} already exists!") 27 | pass 28 | 29 | files = os.listdir(converted_ckpt) 30 | 31 | api = HfApi() 32 | if branch_name != "main": 33 | try: 34 | api.create_branch( 35 | repo_id=repo_name, 36 | repo_type="model", 37 | branch=branch_name, 38 | ) 39 | except: 40 | print(f"branch {branch_name} already exists, try again...") 41 | print(f"to upload: {files}") 42 | for file in files: 43 | print(f"Uploading {file} to branch {branch_name}...") 44 | api.upload_file( 45 | path_or_fileobj=os.path.join(converted_ckpt, file), 46 | path_in_repo=file, 47 | repo_id=repo_name, 48 | repo_type="model", 49 | commit_message=f"Upload {file}", 50 | revision=branch_name, 51 | ) 52 | print(f"Successfully uploaded {file} !") 53 | -------------------------------------------------------------------------------- /gpt-neox/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
17 | 18 | """Train""" 19 | from megatron.neox_arguments import NeoXArgs 20 | from megatron.training import pretrain 21 | 22 | if __name__ == "__main__": 23 | neox_args = NeoXArgs.consume_neox_args() 24 | neox_args.configure_distributed_args() 25 | neox_args.build_tokenizer() # tokenizer needs to be built in training in order to set the padding vocab 26 | neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined 27 | pretrain(neox_args=neox_args) 28 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/00-prepare-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt update 4 | sudo apt upgrade -y 5 | 6 | #disable swap 7 | sudo swapoff -a 8 | sudo sed -e '/swap/ s/^#*/#/' -i /etc/fstab 9 | 10 | nodelistfile='nodes.txt' 11 | USER=etri-aicomputing 12 | 13 | if [ -e $nodelistfile ]; then 14 | while read p; do 15 | ssh-copy-id $USER@$p 16 | #for passwordless sudo 17 | #ssh $USER@$p sudo bash -c 'echo "etri-aicomputing ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' 18 | done < "$nodelistfile" 19 | fi 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/01-install-cudnn-and-nvidia-driver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y update 4 | sudo apt-get -y remove --purge '^nvidia-.*' 5 | sudo apt-get -y remove --purge 'cuda-.*' 6 | 7 | sudo bash -c "echo blacklist nouveau > /etc/modprobe.d/blacklist-nvidia-nouveau.conf" 8 | sudo bash -c "echo options nouveau modeset=0 >> /etc/modprobe.d/blacklist-nvidia-nouveau.conf" 9 | sudo update-initramfs -u 10 | 11 | sudo apt-get -y install nvidia-cuda-toolkit 12 | nvcc -V 13 | whereis cuda 14 | #mkdir ~/nvidia 15 | #cd ~/nvidia 16 | CUDNN_DEB_FILE="cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb" 17 | if !
[ -e $CUDNN_DEB_FILE ]; then 18 | sudo apt-get -y install axel 19 | axel -n 20 https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/${CUDNN_DEB_FILE} 20 | # wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/${CUDNN_DEB_FILE} 21 | fi 22 | sudo dpkg -i ${CUDNN_DEB_FILE} 23 | sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ 24 | sudo apt update 25 | sudo apt -y install libcudnn8=8.8.0.121-1+cuda12.0 26 | sudo apt -y install libcudnn8-dev=8.8.0.121-1+cuda12.0 27 | sudo apt -y install libcudnn8-samples=8.8.0.121-1+cuda12.0 28 | 29 | source ~/.bashrc 30 | 31 | sudo apt install -y ubuntu-drivers-common 32 | ubuntu-drivers devices 33 | sudo apt install -y nvidia-driver-525-server 34 | 35 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/02-install-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg lsb-release 3 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 4 | echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 5 | sudo apt-get update -y 6 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io 7 | sudo docker run hello-world 8 | sudo usermod -aG docker $USER 9 | #&& newgrp docker 10 | sudo service docker restart 11 | 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/03-install-nvidia-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | distribution=$(.
/etc/os-release;echo $ID$VERSION_ID) 3 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 4 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 5 | sudo apt-get -y update 6 | sudo apt-get -y install nvidia-docker2 7 | sudo systemctl restart docker 8 | sudo docker run --runtime nvidia nvidia/cuda:10.1-base /usr/bin/nvidia-smi 9 | 10 | sudo bash -c 'cat <<EOF > /etc/docker/daemon.json 11 | { 12 | "exec-opts": ["native.cgroupdriver=systemd"], 13 | "log-driver": "json-file", 14 | "log-opts": { 15 | "max-size": "100m" 16 | }, 17 | "data-root": "/mnt/storage/docker_data", 18 | "storage-driver": "overlay2", 19 | "default-runtime" : "nvidia", 20 | "runtimes" : { 21 | "nvidia" : { 22 | "path": "/usr/bin/nvidia-container-runtime", 23 | "runtimeArgs" : [] 24 | } 25 | } 26 | } 27 | EOF' 28 | sudo systemctl restart docker 29 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/04-install-k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo swapoff -a 3 | sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab 4 | 5 | sudo apt-get install -y iptables arptables ebtables 6 | sudo apt-get update && sudo apt-get install -y apt-transport-https curl 7 | curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - 8 | cat <>$shellconf 52 | echo 'alias k=kubectl' >>$shellconf 53 | echo 'complete -F __start_kubectl k' >>$shellconf 54 | fi 55 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/06-install-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ~ 4 | git clone https://github.com/kubeflow/manifests.git 5 | cd manifests 6 | git checkout v1.6.0 7 | 8 | kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - 9 | kubectl wait --for=condition=ready pod -l 'app in (cert-manager,webhook)' --timeout=180s -n cert-manager 10 | kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - 11 | 12 | while !
kustomize build example | awk '!/well-defined/' | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done 13 | 14 | # wait until all pods become Running state 15 | watch kubectl get pod -A 16 | 17 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/07-certificate-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl apply -f gateway.yaml 4 | kubectl apply -f certificate.yaml 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/08-port-forward-kubeflow-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup kubectl port-forward --address="0.0.0.0" svc/istio-ingressgateway -n istio-system 8080:443 & 4 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/09-print-join-cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$(kubeadm token create --print-join-command) 4 | echo "sudo $cmd" 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/10-enable-k8s-dashboard-master-only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #kubectl apply -f dashboard-adminuser.yaml 4 | 5 | #kubectl apply -f cluster-role-binding.yaml 6 | 7 | #kubectl -n kubernetes-dashboard describe secrets 8 | # copy admin-user's token value for kubernetes-dashboard 9 | 10 | kubectl create serviceaccount admin-user 11 | kubectl create clusterrolebinding test-user-binding --clusterrole=cluster-admin --serviceaccount=default:admin-user 12 | kubectl get secrets 13 | #specify admin-user's token name in next cmd 14 | kubectl describe secret admin-user-token-8bjnr 15 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/11-reset-k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo kubeadm reset 4 | rm ~/.kube/config 5 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/12-add-kubeflow-user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl create -f profile.yaml 4 | 5 | #kubectl apply -f profile.yaml #if you are modifying the profile 6 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/13-port-forward-k8s-container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl port-forward test222-0 60022:22 -n aicomputing1 4 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/14-remove-a-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ]; then 4 | echo "Usage: $0 [node name to remove]" 5 | exit 0 6 | fi 7 | 8 | kubectl drain $1 --delete-local-data --force --ignore-daemonsets 9 | kubectl delete node $1 10 | 11 | ssh $1 "sudo kubeadm reset" 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/certificate.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: kubeflow-ingressgateway-certs 5 | namespace: istio-system 6 | spec: 7 | commonName: example.com #Domain name 8 | issuerRef: 9 | kind: ClusterIssuer 10 | name: kubeflow-self-signing-issuer 11 | secretName: kubeflow-ingressgateway-certs 12 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: admin-user 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: cluster-admin 9 | subjects: 10 | - kind: ServiceAccount 11 | name: admin-user 12 | namespace: kubernetes-dashboard 13 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/dashboard-adminuser.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: admin-user 5 | namespace: kubernetes-dashboard 6 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: Gateway 3 | metadata: 4 | name: kubeflow-gateway 5 | namespace: kubeflow 6 | spec: 7 | selector: 8 | istio: ingressgateway 9 | servers: 10 | - hosts: 11 | - "*" 12 | port: 13 | name: http 14 | number: 80 15 | protocol: HTTP 16 | # Upgrade HTTP to HTTPS 17 | tls: 18 | httpsRedirect: true 19 | - hosts: 20 | - "*" 21 | port: 22 | name: https 23 | number: 443 24 | protocol: HTTPS 25 | tls: 26 | mode: SIMPLE 27 | credentialName: kubeflow-ingressgateway-certs 28 | 29 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing1 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing1@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2Gi 17 | # requests.nvidia.com/gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing1 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing1@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2Gi 17 | # requests.nvidia.com/gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | --------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing2 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing2@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing3 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing3@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing4 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing4@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile5.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing5 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing5@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | -------------------------------------------------------------------------------- /k8s_kubeflow_install/common/profile6.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: aicomputing6 5 | #replace with the name of profile you want, this will be user's namespace name 6 | spec: 7 | owner: 8 | kind: User 9 | name: aicomputing6@etri.re.kr 10 | #replace with the email of the user 11 | 12 | resourceQuotaSpec: 13 | #resource quota can be set optionally 14 | # hard: 15 | # cpu: "2" 16 | # memory: 2 Gi 17 | # requests.nvidia.com / gpu: "1" 18 | # persistentvolumeclaims: "1" 19 | # requests.storage: "5Gi" 20 | 
/k8s_kubeflow_install/common/profile7.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing7
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing7@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile8.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing8
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing8@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/common/profile9.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1beta1
2 | kind: Profile
3 | metadata:
4 |   name: aicomputing9
5 |   #replace with the name of profile you want, this will be user's namespace name
6 | spec:
7 |   owner:
8 |     kind: User
9 |     name: aicomputing9@etri.re.kr
10 |   #replace with the email of the user
11 | 
12 |   resourceQuotaSpec:
13 |   #resource quota can be set optionally
14 |   # hard:
15 |   #   cpu: "2"
16 |   #   memory: 2 Gi
17 |   #   requests.nvidia.com / gpu: "1"
18 |   #   persistentvolumeclaims: "1"
19 |   #   requests.storage: "5Gi"
20 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/Dockerfile.org:
--------------------------------------------------------------------------------
1 | #FROM public.ecr.aws/j1r0q0g6/notebooks/notebook-servers/jupyter-tensorflow-cuda-full:v1.5.0
2 | FROM public.ecr.aws/j1r0q0g6/notebooks/notebook-servers/jupyter-pytorch-cuda-full:v1.5.0
3 | USER root
4 | ENV NB_USER=jovyan
5 | 
6 | # Replace the Nvidia GPG public key
7 | #RUN rm /etc/apt/sources.list.d/cuda.list \
8 | #    && rm /etc/apt/sources.list.d/nvidia-ml.list
9 | 
10 | RUN apt-get update && apt-get install -y --no-install-recommends \
11 |     sudo \
12 |     apt-utils \
13 |     && usermod -aG sudo ${NB_USER} \
14 |     && echo ${NB_USER}:${NB_USER} | chpasswd \
15 |     && echo "${NB_USER} ALL=(root) NOPASSWD:SETENV: /init" >> /etc/sudoers
16 | 
17 | # install - requirements.txt
18 | # COPY requirements.txt requirements.txt
19 | # RUN pip3 install -r requirements.txt
20 | 
21 | USER $NB_USER
22 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/cuda-requirements.txt:
--------------------------------------------------------------------------------
1 | --find-links https://download.pytorch.org/whl/torch_stable.html
2 | torch==1.8.1+cu111
3 | torchvision==0.9.1+cu111
4 | torchaudio==0.8.1
5 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/make_dockerimage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | 
4 | cp Dockerfile.scratch Dockerfile
5 | docker build --no-cache -t swsok/nvidia-pytorch-kubeflow:v1 .
6 | docker login --username swsok --password etri-aicomputing
7 | docker push swsok/nvidia-pytorch-kubeflow:v1
8 | 
9 | #cp Dockerfile.org Dockerfile
10 | #docker build --no-cache -t swsok/jupyter-pytorch-cuda-full-sudo:v1.5.0 .
11 | #docker login --username swsok --password etri-aicomputing
12 | #docker push swsok/jupyter-pytorch-cuda-full-sudo:v1.5.0
13 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/requirements.txt:
--------------------------------------------------------------------------------
1 | 
2 | jupyterlab==3.4.3
3 | notebook==6.4.12
4 | ipykernel==6.15.0
5 | # kubeflow packages
6 | kfp==1.6.3
7 | kfp-server-api==1.6.0
8 | kfserving==0.5.1
9 | 
10 | # common packages
11 | bokeh==2.3.2
12 | cloudpickle==1.6.0
13 | dill==0.3.4
14 | ipympl==0.7.0
15 | ipywidgets==7.6.3
16 | jupyterlab-git==0.30.1
17 | matplotlib==3.4.2
18 | pandas==1.2.4
19 | scikit-image==0.18.1
20 | scikit-learn==0.24.2
21 | scipy==1.7.0
22 | seaborn==0.11.1
23 | xgboost==1.4.2
24 | 
25 | # pytorch packages
26 | #torchelastic==0.2.2 this currently causes a dependency conflict, should be fixed very soon
27 | fastai==2.4
28 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/s6/cont-init.d/01-copy-tmp-home:
--------------------------------------------------------------------------------
1 | #!/usr/bin/with-contenv bash
2 | cp -r -n /tmp_home/* /home/
3 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/docker/s6/services.d/jupyterlab/run:
--------------------------------------------------------------------------------
1 | #!/usr/bin/with-contenv bash
2 | cd "${HOME}"
3 | exec /opt/conda/bin/jupyter lab \
4 |     --notebook-dir="${HOME}" \
5 |     --ip=0.0.0.0 \
6 |     --no-browser \
7 |     --allow-root \
8 |     --port=8888 \
9 |     --ServerApp.token="" \
10 |     --ServerApp.password="" \
11 |     --ServerApp.allow_origin="*" \
12 |     --ServerApp.base_url="${NB_PREFIX}" \
13 |     --ServerApp.authenticate_prometheus=False
14 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/setup_for_gpu_node_master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ORG_DIR=$PWD
4 | PROGRESS_FILE="$PWD/progress.stat"
5 | 
6 | chmod a+x common/*.sh
7 | 
8 | cd common
9 | 
10 | # create the progress stat file on the first run, then read the current stage
11 | # (reading it before it exists would fail)
12 | if ! [ -e "$PROGRESS_FILE" ]; then
13 |     ./00-prepare-nodes.sh
14 |     echo "0" > "$PROGRESS_FILE"
15 | fi
16 | STAGE=$(<"$PROGRESS_FILE")
17 | 
18 | # installing CUDNN and Nvidia-driver
19 | if (( $STAGE < 1 )); then
20 |     ./01-install-cudnn-and-nvidia-driver.sh
21 |     echo "1" > "$PROGRESS_FILE"
22 |     sudo reboot
23 | fi
24 | 
25 | # installing docker
26 | if (( $STAGE < 2 )); then
27 |     ./02-install-docker.sh
28 |     echo "2" > "$PROGRESS_FILE"
29 |     sudo reboot
30 | fi
31 | 
32 | # installing nvidia docker - for testing docker and gpus
33 | if (( $STAGE < 3 )); then
34 |     ./03-install-nvidia-docker.sh
35 |     echo "3" > "$PROGRESS_FILE"
36 | fi
37 | 
38 | # installing Kubernetes
39 | if (( $STAGE < 4 )); then
40 |     ./04-install-k8s.sh
41 |     echo "4" > "$PROGRESS_FILE"
42 |     sudo reboot
43 | fi
44 | 
45 | # configuring Kubernetes
46 | if (( $STAGE < 5 )); then
47 |     ./05-init-k8s-master-only.sh
48 |     echo "5" > "$PROGRESS_FILE"
49 |     sudo reboot
50 | fi
51 | 
52 | cd "$ORG_DIR"
53 | 
--------------------------------------------------------------------------------
/k8s_kubeflow_install/setup_for_gpu_node_worker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ORG_DIR=$PWD
4 | PROGRESS_FILE="$PWD/worker_progress.stat"
5 | 
6 | chmod a+x common/*.sh
7 | 
8 | cd common
9 | 
10 | # create the progress stat file on the first run, then read the current stage
11 | if ! [ -e "$PROGRESS_FILE" ]; then
12 |     ./00-prepare-nodes.sh
13 |     echo "0" > "$PROGRESS_FILE"
14 | fi
15 | STAGE=$(<"$PROGRESS_FILE")
16 | 
17 | # installing CUDNN and Nvidia-driver
18 | if (( $STAGE < 1 )); then
19 |     ./01-install-cudnn-and-nvidia-driver.sh
20 |     echo "1" > "$PROGRESS_FILE"
21 |     sudo reboot
22 | fi
23 | 
24 | # installing docker
25 | if (( $STAGE < 2 )); then
26 |     ./02-install-docker.sh
27 |     echo "2" > "$PROGRESS_FILE"
28 |     sudo reboot
29 | fi
30 | 
31 | # installing nvidia docker - for testing docker and gpus
32 | if (( $STAGE < 3 )); then
33 |     ./03-install-nvidia-docker.sh
34 |     echo "3" > "$PROGRESS_FILE"
35 | fi
36 | 
37 | # installing Kubernetes
38 | if (( $STAGE < 4 )); then
39 |     ./04-install-k8s.sh
40 |     echo "4" > "$PROGRESS_FILE"
41 |     sudo reboot
42 | fi
43 | 
44 | cd "$ORG_DIR"
45 | 
--------------------------------------------------------------------------------
/llama3_inference/README.md:
--------------------------------------------------------------------------------
1 | # Llama3 8B inference examples
2 | 
3 | ### Basic version
4 | 
5 | python3 llama3_inference_basic.py
6 | 
7 | ### Memory offload version (some layers are swapped out to host memory to free up GPU memory)
8 | 
9 | python3 llama3_inference_memory_offload.py
10 | 
11 | ### Required python packages
12 | 
13 | pip3 install torch huggingface_hub transformers datasets bitsandbytes gradio pypdf accelerate
14 | 
15 | A Gradio-based web UI is provided; with the default configuration it is reachable at 127.0.0.1:7860.
16 | 
17 | We recommend using a GPU with more than 8GB of memory.
18 | 
19 | ## License
20 | 
21 | The results of the AIcomp project are distributed under the 3-clause BSD license.
22 | 
--------------------------------------------------------------------------------
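A minimal sketch of the memory-offload idea behind llama3_inference_memory_offload.py, assuming the Hugging Face `accelerate` integration (`device_map`/`max_memory`); the repository's script may implement the swapping differently, and the model id and memory budgets below are assumptions:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed model id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",                       # fill the GPU first...
    max_memory={0: "7GiB", "cpu": "48GiB"},  # ...then spill remaining layers to host RAM
    offload_folder="offload",                # optional spill-over to disk
)

prompt = "Explain pipeline parallelism in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))

--------------------------------------------------------------------------------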
/mlperf/pytorch-22.09/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | #To get the latest APEX
16 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.09-py3
17 | FROM ${FROM_IMAGE_NAME}
18 | 
19 | # Install dependencies
20 | RUN apt-get update \
21 |     && apt-get install -y --no-install-recommends \
22 |         bzip2 \
23 |         cabextract \
24 |         iputils-ping \
25 |         pbzip2 \
26 |         pv \
27 |     && rm -rf /var/lib/apt/lists/*
28 | 
29 | WORKDIR /workspace/bert
30 | COPY requirements.txt .
31 | RUN pip install --no-cache-dir -r requirements.txt
32 | #swsok, To enable download big files from google
33 | RUN pip install -U --no-cache-dir gdown --pre
34 | 
35 | # Preprocessing
36 | WORKDIR /workspace
37 | RUN cd /workspace && git clone https://github.com/attardi/wikiextractor.git
38 | RUN cd /workspace/wikiextractor && git checkout e4abb4cbd019b0257824ee47c23dd163919b731b
39 | 
40 | # Install BERT
41 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data
42 | WORKDIR /workspace/bert
43 | COPY . .
44 | 
45 | ENV PYTHONPATH "/workspace/bert"
46 | 
47 | RUN cd /workspace/bert/mhalib && python setup.py build && cp build/lib*/mhalib* ../
48 | WORKDIR /workspace/bert
49 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/NOTICE:
--------------------------------------------------------------------------------
1 | BERT PyTorch
2 | 
3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT
4 | licensed under the Apache License 2.0.
5 | 
6 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/README_2xa30_ngc22.09_pytorch.md:
--------------------------------------------------------------------------------
1 | ## Steps to launch training on a single node with 2xA30
2 | 
3 | ### NVIDIA DGX single node
4 | Launch configuration and system-specific hyperparameters for the NVIDIA A30
5 | multi node submission are in the following scripts:
6 | * for the 2xA30 1-node NVIDIA submission: `config_A30_1x2x224x14.sh`
7 | 
8 | Steps required to launch multi node training on NVIDIA 2xA30:
9 | 
10 | 1. Build the container:
11 | 
12 | ```
13 | docker build --pull -t <docker/registry>/mlperf-nvidia:language_model .
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 1-node NVIDIA 2xA30 training:
20 | 
21 | ```
22 | source config_A30_1x2x224x14.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> ./run_with_docker.sh
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 512-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_512x8x2x1_pack.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> sbatch -N $DGXNNODES -t $WALLTIME run.sub
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 8-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_8x8x48x1.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> sbatch -N $DGXNNODES -t $WALLTIME run.sub
24 | ```
--------------------------------------------------------------------------------
[...]
14 | docker push <docker/registry>/mlperf-nvidia:language_model
15 | ```
16 | 
17 | 2. Launch the training:
18 | 
19 | 1-node NVIDIA DGX A100 training:
20 | 
21 | ```
22 | source config_DGXA100_1x8x56x1.sh
23 | CONT=<docker/registry>/mlperf-nvidia:language_model DATADIR=<path/to/datadir> DATADIR_PHASE2=<path/to/datadir_phase2> EVALDIR=<path/to/evaldir> CHECKPOINTDIR=<path/to/checkpointdir> CHECKPOINTDIR_PHASE1=<path/to/checkpointdir_phase1> ./run_with_docker.sh
24 | ```
--------------------------------------------------------------------------------
[...]
33 |         curr_real_tokens = np.sum(inputs[0][i, :] > 0)
34 |         curr_real_mask = np.sum(inputs[3][i, :] > 0)
35 |         real_tokens[idx] = curr_real_tokens
36 |         real_mask[idx] = curr_real_mask
37 |         idx += 1
38 | 
39 |     n_samples += n_samples_shard
40 | 
41 | hfile.close()
42 | 
43 | print('n_samples:,', n_samples)
44 | print('n_tokens_per_seq:', n_tokens_per_seq)
45 | print('n_mask_per_seq:', n_mask_per_seq)
46 | print('total n_pad_tokens:', np.sum(n_tokens_per_seq - real_tokens[:n_samples]))
47 | print('total n_pad_mask_tokens:', np.sum(n_mask_per_seq - real_mask[:n_samples]))
48 | print('mean pad tokens per seq:', np.mean(real_tokens[:n_samples]))
49 | print('mean pad masks per seq:', np.mean(real_mask[:n_samples]))
50 | 
51 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/cleanup_scripts/parallel_create_hdf5.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved.
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | cpus=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w )
16 | cpus=$((cpus / 2))
17 | echo "Using $cpus CPU cores"
18 | 
19 | mkdir -p hdf5/
20 | find -L results4/ -name "part*" | xargs --max-args=1 --max-procs=$cpus ./create_pretraining_data_wrapper.sh
21 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/cleanup_scripts/process_wiki.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 MLBenchmark Group. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 | 
17 | # invocation script to cleanup the wiki dataset
18 | # Usage: ./process_wiki.sh <input files>
19 | # example: ./process_wiki.sh 'sample_data/wiki_??'
20 | # The resulted files will be placed in ./results 21 | 22 | inputs=$1 23 | 24 | pip install nltk 25 | 26 | # Remove doc tag and title 27 | python ./cleanup_file.py --data=$inputs --output_suffix='.1' 28 | 29 | # Further clean up files 30 | for f in ${inputs}; do 31 | ./clean.sh ${f}.1 ${f}.2 32 | done 33 | 34 | # Sentence segmentation 35 | python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3' 36 | 37 | mkdir -p ./results 38 | 39 | ## Choose file size method or number of packages by uncommenting only one of the following do_gather options 40 | # Gather into fixed size packages 41 | python ./do_gather.py --data=$inputs --input_suffix='.3' --block_size=26.92 --out_dir='./results' 42 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/cleanup_scripts/transparency_in_test_set_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import glob 15 | 16 | output_filename = 'wiki_test_set.txt' 17 | 18 | test_articles = [] 19 | 20 | file_glob = glob.glob('./results/part*', recursive=False) 21 | 22 | with open(output_filename, mode='w', newline='\n') as ofile: 23 | for filename in file_glob: 24 | articles_in_file = [] 25 | with open(filename, mode='r', newline='\n') as ifile: 26 | lines = ifile.read() 27 | articles_in_file_tmp = lines.split('\n\n') 28 | articles_in_file = [] 29 | for item in articles_in_file_tmp: 30 | if item.rstrip() != '': 31 | articles_in_file.append(item) 32 | 33 | target_article = min(42, len(articles_in_file) // 2) 34 | test_articles.append(articles_in_file[target_article]) 35 | 36 | with open(filename, mode='w', newline='\n') as ifile: 37 | for article in articles_in_file[:target_article]: 38 | ifile.write(article) 39 | ifile.write('\n\n') 40 | 41 | for article in articles_in_file[target_article+1:]: 42 | ifile.write(article) 43 | ifile.write('\n\n') 44 | 45 | for article in test_articles: 46 | ofile.write(article) 47 | ofile.write('\n\n') 48 | 49 | print("n_articles =", len(test_articles)) 50 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_A30_1x2x224x14.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=224 3 | export GRADIENT_STEPS=14 4 | export LR=3.7e-4 5 | export MAX_SAMPLES_TERMINATION=20000000 6 | export MAX_STEPS=7100 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_bias_mha --fused_dropout_add " 13 | export PHASE=2 14 | export EVAL_ITER_START_SAMPLES=150000 15 | export EVAL_ITER_SAMPLES=150000 16 | 17 | ## System run parms 18 | export 
DGXNNODES=1 19 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 20 | export WALLTIME=04:00:00 21 | 22 | ## System config params 23 | export DGXNGPU=2 24 | export DGXSOCKETCORES=64 25 | export DGXNSOCKET=2 26 | export DGXHT=2 # HT is on is 2, HT off is 1 27 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_A40_1x2x224x14.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=224 3 | export GRADIENT_STEPS=14 4 | export LR=3.7e-4 5 | export MAX_SAMPLES_TERMINATION=20000000 6 | export MAX_STEPS=7100 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | #export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_bias_mha --fused_dropout_add " 13 | export EXTRA_PARAMS="--dense_seq_output --unpad --exchange_padding --dwu-group-size=2 --fused_bias_fc --fused_dropout_add " 14 | export PHASE=2 15 | export EVAL_ITER_START_SAMPLES=150000 16 | export EVAL_ITER_SAMPLES=150000 17 | 18 | ## System run parms 19 | export DGXNNODES=1 20 | #export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 21 | export DGXSYSTEM="A40_1x2x224x14" 22 | export WALLTIME=04:00:00 23 | 24 | ## System config params 25 | export DGXNGPU=2 26 | export DGXSOCKETCORES=8 27 | export DGXNSOCKET=1 28 | export DGXHT=2 # HT is on is 2, HT off is 1 29 | 30 | export CONT=swsok/mlperf-nvidia:language_model 31 | export DATADIR="/home/swosok/mlperf/bert/hdf5/training-4320/hdf5_4320_shards_varlength" 32 | export DATADIR_PHASE2="/home/swsok/mlperf/bert/hdf5/training-4320/hdf5_4320_shards_varlength" 33 | export EVALDIR="/home/swsok/mlperf/bert/hdf5/eval_varlength" 34 | export CHECKPOINTDIR_PHASE1="/home/swsok/mlperf/bert/phase1" 35 | export CHECKPOINTDIR="/home/swsok/mlperf/bert/checkpoints" 36 | export CUDA_VISIBLE_DEVICES="0,1" 37 | export NEXP=1 38 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_1x4x56x2.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=112 3 | export GRADIENT_STEPS=2 4 | export LR=3.5e-4 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=8041 7 | export OPT_LAMB_BETA_1=0.9 8 | export OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | 12 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --dwu-group-size=4 " 13 | export PHASE=2 14 | export EVAL_ITER_START_SAMPLES=150000 15 | export EVAL_ITER_SAMPLES=150000 16 | 17 | ## System run parms 18 | export DGXNNODES=1 19 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 20 | export WALLTIME=01:15:00 21 | 22 | ## System config params 23 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_4gpu_common.sh 24 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_1x8x56x1.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=56 3 | export GRADIENT_STEPS=1 4 | export LR=0.000425 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=6700 7 | export OPT_LAMB_BETA_1=0.9 8 | export 
OPT_LAMB_BETA_2=0.999 9 | export START_WARMUP_STEP=0 10 | export WARMUP_PROPORTION=0.0 11 | export WEIGHT_DECAY_RATE=0.01 12 | export INIT_LOSS_SCALE=1024.0 13 | 14 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_gemm_gelu " 15 | export PHASE=2 16 | export EVAL_ITER_START_SAMPLES=150000 17 | export EVAL_ITER_SAMPLES=150000 18 | 19 | ## System run parms 20 | export DGXNNODES=1 21 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 22 | export WALLTIME_MINUTES=23 23 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 24 | 25 | ## System config params 26 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 27 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_4gpu_common.sh: -------------------------------------------------------------------------------- 1 | ## System config params 2 | export DGXNGPU=4 3 | export DGXSOCKETCORES=64 4 | export DGXNSOCKET=2 5 | export DGXHT=2 # HT is on is 2, HT off is 1 6 | export SLURM_NTASKS=${DGXNGPU} 7 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 8 | 9 | ## Data Paths 10 | export DATADIR="/raid/datasets/bert/hdf5/4320_shards" 11 | export EVALDIR="/raid/datasets/bert/hdf5/eval_4320_shard" 12 | export DATADIR_PHASE2="/raid/datasets/bert/hdf5/4320_shards" 13 | export CHECKPOINTDIR="$CI_BUILDS_DIR/$SLURM_ACCOUNT/$CI_JOB_ID/ci_checkpoints" 14 | export RESULTSDIR="$CI_BUILDS_DIR/$SLURM_ACCOUNT/$CI_JOB_ID/results" 15 | #using existing checkpoint_phase1 dir 16 | export CHECKPOINTDIR_PHASE1="/raid/datasets/bert/checkpoints/checkpoint_phase1" 17 | export UNITTESTDIR="/lustre/fsw/mlperf/mlperft-bert/unit_test" 18 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_512x8x2x1_pack.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=2 3 | export GRADIENT_STEPS=1 4 | export PACKING_FACTOR=2 5 | export INIT_LOSS_SCALE=128.0 6 | export LR=0.0033 7 | export MAX_SAMPLES_TERMINATION=12000000 8 | export MAX_STEPS=470 9 | export OPT_LAMB_BETA_1=0.75 10 | export OPT_LAMB_BETA_2=0.9 11 | export START_WARMUP_STEP=-100 12 | export WEIGHT_DECAY_RATE=0.0166629 13 | export WARMUP_STEPS=290 14 | export SBATCH_NETWORK=sharp 15 | export NCCL_GRAPH_REGISTER=1 16 | export EXTRA_PARAMS="--use_cuda_graph --pad_fmha --cuda_graph_mode 'full_iteration' --max_iterations_per_graph 1 --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_bias_fc_loss_head --packed_samples " 17 | export PHASE=2 18 | 19 | ## System run parms 20 | export DGXNNODES=512 21 | 22 | # hparams that depend on number of nodes 23 | export EVAL_ITER_START_SAMPLES=325000 #$(echo "25000*(0.05*(230.23*${BATCHSIZE}*${DGXNNODES}*8*${PACKING_FACTOR}+3000000)/25000)" | bc) 24 | export EVAL_ITER_SAMPLES=${EVAL_ITER_START_SAMPLES} 25 | 26 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 27 | export WALLTIME_MINUTES=7 28 | 29 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 30 | 31 | ## System config params 32 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 33 | export DATADIR_PHASE2="/raid/datasets/bert/hdf5/4320_packed_shards" 34 | 35 | export CONTAINER_PRELOAD_LUSTRE=1 36 | export USE_DDP=1 37 | -------------------------------------------------------------------------------- 
/mlperf/pytorch-22.09/config_DGXA100_8x8x48x1.sh: -------------------------------------------------------------------------------- 1 | ## DL params 2 | export BATCHSIZE=48 3 | export GRADIENT_STEPS=1 4 | export LR=0.0020992 5 | export MAX_SAMPLES_TERMINATION=4500000 6 | export MAX_STEPS=1059 7 | export OPT_LAMB_BETA_1=0.60466 8 | export OPT_LAMB_BETA_2=0.85437 9 | export START_WARMUP_STEP=0 10 | export WARMUP_STEPS=0 11 | export WEIGHT_DECAY_RATE=0.1 12 | export INIT_LOSS_SCALE=4096.0 13 | 14 | export SBATCH_NETWORK=sharp 15 | export EXTRA_PARAMS="--dense_seq_output --unpad --unpad_fmha --exchange_padding --fused_bias_fc --fused_bias_mha --fused_dropout_add --fused_gemm_gelu " 16 | export PHASE=2 17 | export EVAL_ITER_START_SAMPLES=175000 18 | export EVAL_ITER_SAMPLES=175000 19 | 20 | ## System run parms 21 | export DGXNNODES=8 22 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 23 | export WALLTIME_MINUTES=15 24 | export WALLTIME=$(( ${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) 25 | 26 | ## System config params 27 | source $(dirname ${BASH_SOURCE[0]})/config_DGXA100_common.sh 28 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/config_DGXA100_common.sh: -------------------------------------------------------------------------------- 1 | ## System config params 2 | export DGXNGPU=8 3 | export DGXSOCKETCORES=64 4 | export DGXNSOCKET=2 5 | export DGXHT=2 # HT is on is 2, HT off is 1 6 | export SLURM_NTASKS=${DGXNGPU} 7 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/input_preprocessing/create_pretraining_data_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 16 | 17 | INPUT=${1} 18 | OUTPUT=${2}/$(basename $INPUT) 19 | VOCAB=${3} 20 | 21 | python3 ${SCRIPT_DIR}/create_pretraining_data.py \ 22 | --input_file=${INPUT} \ 23 | --output_file=${OUTPUT} \ 24 | --vocab_file=${VOCAB} \ 25 | --do_lower_case=True \ 26 | --max_seq_length=512 \ 27 | --max_predictions_per_seq=76 \ 28 | --masked_lm_prob=0.15 \ 29 | --random_seed=12345 \ 30 | --dupe_factor=10 31 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/input_preprocessing/do_sentence_segmentation.py: -------------------------------------------------------------------------------- 1 | """Script for sentence segmentation. 
2 | 
3 | Copied and modified from https://github.com/eric-haibin-lin/text-proc.git
4 | """
5 | import argparse
6 | import glob
7 | import io
8 | import logging
9 | import multiprocessing
10 | import os
11 | import time
12 | import nltk
13 | 
14 | from nltk.tokenize import sent_tokenize
15 | 
16 | parser = argparse.ArgumentParser(
17 |     description='Sentence segmentation for BERT documents.')
18 | parser.add_argument(
19 |     '--data',
20 |     type=str,
21 |     default='./*/*.compact',
22 |     help='Input files. Default is "./*/*.compact"')
23 | parser.add_argument(
24 |     '--input_suffix',
25 |     type=str,
26 |     default='.2',
27 |     help='Suffix for input files. Default is ".2"')
28 | parser.add_argument(
29 |     '--output_suffix',
30 |     type=str,
31 |     default='.3',
32 |     help='Suffix for output files. Default is ".3"')
33 | parser.add_argument(
34 |     '--nworker',
35 |     type=int,
36 |     default=72,
37 |     help='Number of workers for parallel processing.')
38 | args = parser.parse_args()
39 | 
40 | # download package
41 | nltk.download('punkt')
42 | 
43 | # arguments
44 | input_files = sorted(glob.glob(os.path.expanduser(args.data)))
45 | num_files = len(input_files)
46 | num_workers = args.nworker
47 | logging.basicConfig(level=logging.INFO)
48 | logging.info('Number of input files to process = %d', num_files)
49 | 
50 | 
51 | def process_one_file(one_input):
52 |     """Separate paragraphs into sentences, for one file."""
53 |     input_filename = one_input + args.input_suffix
54 |     output_filename = one_input + args.output_suffix
55 |     logging.info('Processing %s => %s', input_filename, output_filename)
56 |     with io.open(input_filename, 'r', encoding='utf-8') as fin:
57 |         with io.open(output_filename, 'w', encoding='utf-8') as fout:
58 |             for line in fin:
59 |                 if len(line) == 1:
60 |                     fout.write(u'\n')
61 |                 sents = sent_tokenize(line)
62 |                 for sent in sents:
63 |                     sent_str = sent.strip()
64 |                     # if sent_str:
65 |                     fout.write('%s\n' % sent_str)
66 |                 fout.write(u'\n')
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     tic = time.time()
71 |     p = multiprocessing.Pool(num_workers)
72 |     p.map(process_one_file, input_files)
73 |     toc = time.time()
74 |     logging.info('Processed %s in %.2f sec', args.data, toc - tic)
75 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/eval_varlength.chk:
--------------------------------------------------------------------------------
1 | part_eval_10k.hdf5 611d8bae26646145e1c33338a27ba124
2 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/hdf5_md5.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import h5py
3 | import numpy as np
4 | import hashlib
5 | import os
6 | 
7 | # Example usage:
8 | # python3 hdf5_md5.py --input_hdf5=part_eval_10k.hdf5
9 | 
10 | parser = argparse.ArgumentParser(
11 |     description="HDF5 variable length to MD5sums for BERT.")
12 | parser.add_argument(
13 |     '--input_hdf5',
14 |     type=str,
15 |     required=True,
16 |     help='Input hdf5 path')
17 | args = parser.parse_args()
18 | 
19 | 
20 | if __name__ == '__main__':
21 | 
22 |     h = hashlib.md5
23 | 
24 |     row_sums = []
25 |     f = h5py.File(args.input_hdf5, 'r')
26 |     for i in range(f['input_ids'].shape[0]):
27 |         row_sums.append(h(str(f['input_ids'][i].tolist()).encode('utf-8')).hexdigest())
28 |     f.close()
29 |     print("{}\t{}".format(os.path.basename(args.input_hdf5), h(str(row_sums).encode('utf-8')).hexdigest()))
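hdf5_md5.py pairs with eval_varlength.chk above: it prints the shard basename and a digest, which should match the recorded line. A usage sketch (the shard's directory is an assumption):

python3 hdf5_md5.py --input_hdf5=eval_varlength/part_eval_10k.hdf5
# expected output:
# part_eval_10k.hdf5    611d8bae26646145e1c33338a27ba124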
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/packed_data/README.md:
--------------------------------------------------------------------------------
1 | # Download and prepare the data
2 | 
3 | Building the Docker container:
4 | ```shell
5 | docker build --pull -t <docker/registry>/mlperf-nvidia:language_model .
6 | docker push <docker/registry>/mlperf-nvidia:language_model
7 | ```
8 | 
9 | Go through the standard data preparation up to the moment where you have the unpacked "results4" sources.
10 | 
11 | Assuming /data/mlperf/bert/ contains the 'results4' and 'phase1' directories.
12 | 
13 | Start the container interactively, mounting the directory where you want to store the experiment data as `/workspace/bert_data`:
14 | ```
15 | docker run -it --runtime=nvidia --ipc=host (...) -v /data/mlperf/bert:/workspace/bert_data mlperf-nvidia:language_model
16 | ```
17 | 
18 | To prepare the packed version of the data, we first need to group the training sequences by length (the number of valid tokens). To parallelize the process easily, each shard is processed separately and the results are merged at the end.
19 | ```
20 | mkdir -p /workspace/bert_data/per_seqlen_parts
21 | for shard in `seq -w 00000 00499`; do
22 |     mkdir -p /workspace/bert_data/per_seqlen_parts/part-${shard}
23 | done
24 | ```
25 | 
26 | Parallelize over $CPUS cores:
27 | ```
28 | CPUS=64
29 | seq -w 00000 00499 | xargs --max-args=1 --max-procs=$CPUS -I{} python create_per_seqlength_data.py --input_file ../download/results4/part-{}-of-00500 --output_file ./per_seqlen/part_{} --vocab_file ../phase1/vocab.txt --do_lower_case=True --max_seq_length=512 --max_predictions_per_seq=76 --masked_lm_prob=0.15 --random_seed=12345 --dupe_factor=10
30 | ```
31 | 
32 | Merge all results:
33 | ```
34 | mkdir -p /workspace/bert_data/per_seqlen
35 | seq 0 511 | xargs --max-args=1 --max-procs=$CPUS -I{} python ./gather_per_seqlength_data.py --input_hdf5 /workspace/bert_data/per_seqlen_parts --output_hdf5 /workspace/bert_data/per_seqlen --seq_length {}
36 | ```
37 | 
38 | Generate a sub-optimal packing strategy based on the length distribution of the training set and store per-shard lists of sample assignments (a toy illustration of the packing idea follows this file):
39 | ```
40 | python ./generate_packing_strategy.py --input_hdf5 /workspace/bert_data/per_seqlen --output_hdf5 /workspace/bert_data/packed_data --max_seq_length 512 --max_seq_per_sample 3 --shards_num 4320
41 | ```
42 | 
43 | Create training set shards based on the generated lists:
44 | ```
45 | python create_packed_trainset.py --input_hdf5 /workspace/bert_data/per_seqlen --assignment_file /workspace/bert_data/packed_data --output_hdf5 /workspace/bert_data/packed_data
46 | ```
47 | 
--------------------------------------------------------------------------------
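The packing step above fits several short sequences into one 512-token training sample. A toy, illustrative first-fit sketch of the idea — not the repository's actual generate_packing_strategy.py algorithm:

def pack(lengths, max_seq_length=512, max_seq_per_sample=3):
    """Greedy first-fit-decreasing packing of sequence lengths into samples."""
    bins = []  # each bin: [remaining token capacity, [sequence indices]]
    for idx in sorted(range(len(lengths)), key=lambda i: -lengths[i]):
        for b in bins:
            if lengths[idx] <= b[0] and len(b[1]) < max_seq_per_sample:
                b[0] -= lengths[idx]
                b[1].append(idx)
                break
        else:  # no existing bin fits: open a new one
            bins.append([max_seq_length - lengths[idx], [idx]])
    return [b[1] for b in bins]

print(pack([500, 120, 380, 90, 200]))  # -> [[0], [2, 1], [4, 3]]

--------------------------------------------------------------------------------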
/mlperf/pytorch-22.09/input_preprocessing/process_wiki.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # invocation script to cleanup the wiki dataset
4 | # Usage: ./process_wiki.sh <input files>
5 | # example: ./process_wiki.sh 'sample_data/wiki_??'
6 | # The resulting files will be placed in ./results
7 | 
8 | inputs=$1
9 | 
10 | pip install nltk
11 | 
12 | # Remove doc tag and title
13 | # python ./cleanup_file.py --data=$inputs --output_suffix='.1'
14 | 
15 | # Further clean up files
16 | # for f in ${inputs}; do
17 | #     ./clean.sh ${f}.1 ${f}.2
18 | # done
19 | 
20 | # Sentence segmentation
21 | # python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'
22 | 
23 | mkdir -p ./results
24 | 
25 | # Train/Eval separation
26 | python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
27 | 
28 | ## Choose file size method or number of packages by uncommenting only one of the following do_gather options
29 | # Gather into fixed size packages
30 | python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
31 | 
32 | # Gather into fixed number of packages
33 | #NUM_PACKAGES=512
34 | #python ./do_gather.py --data=$inputs --input_suffix='.3' --num_outputs=$NUM_PACKAGES --out_dir='./results'
35 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/input_preprocessing/shuffle_samples.py:
--------------------------------------------------------------------------------
1 | from typing import OrderedDict
2 | import h5py
3 | import numpy as np
4 | import argparse
5 | import logging
6 | from tqdm import tqdm
7 | from itertools import repeat, cycle
8 | import json
9 | import glob
10 | import random
11 | 
12 | logging.basicConfig(level=logging.INFO)
13 | parser = argparse.ArgumentParser(
14 |     description="Training data sharding for BERT.")
15 | parser.add_argument(
16 |     '--input_hdf5',
17 |     type=str,
18 |     default='hdf5',
19 |     help='Input hdf5_file path')
20 | parser.add_argument(
21 |     '--output_hdf5',
22 |     type=str,
23 |     default='',
24 |     help='Output hdf5_dir path')
25 | args = parser.parse_args()
26 | 
27 | 
28 | input_files = sorted(glob.glob(args.input_hdf5 + '/part_*.hdf5', recursive=False))
29 | num_shards = len(input_files)
30 | logging.info('n_input_shards = {}'.format(num_shards))
31 | 
32 | ifile_handles = {}
33 | for ifile_idx in tqdm(range(num_shards)):
34 |     handle = h5py.File(f'{input_files[ifile_idx]}', 'r')
35 |     print(handle.keys())
36 |     ifile_handles[ifile_idx] = [
37 |         handle['input_ids'][:],
38 |         handle['input_mask'][:],
39 |         handle['segment_ids'][:],
40 |         handle['masked_lm_positions'][:],
41 |         handle['masked_lm_ids'][:],
42 |         handle['next_sentence_labels'][:]
43 |     ]
44 |     handle.close()
45 | 
46 | ind = [(i, j) for idx in range(num_shards) for i, j in zip(cycle([idx]), list(range(ifile_handles[idx][0].shape[0])))]
47 | random.shuffle(ind)
48 | 
49 | # dump per-shard sample index lists
50 | master_sample_idx = 0
51 | for ofile_idx in tqdm(range(num_shards)):
52 |     n_samples_in_this_shard = ifile_handles[ofile_idx][0].shape[0]
53 |     idxs = ind[master_sample_idx:master_sample_idx+n_samples_in_this_shard]
54 |     with open(f'{args.output_hdf5}/shard_list_{ofile_idx:05}.lst', 'w') as f:
55 |         f.write(json.dumps(idxs))
56 |     # swsok: this line was omitted in the original script (without it, every
57 |     # shard would receive the same slice of the shuffled index list)
58 |     master_sample_idx = master_sample_idx + n_samples_in_this_shard
59 | 
--------------------------------------------------------------------------------
/mlperf/pytorch-22.09/mhalib/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2022 NVIDIA CORPORATION. All rights reserved.
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import setuptools 17 | from setuptools import setup 18 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 19 | 20 | setup( 21 | name='mhalib', 22 | ext_modules=[ 23 | CUDAExtension( 24 | name='mhalib', 25 | sources=['mha_funcs.cu'], 26 | extra_compile_args={ 27 | 'cxx': ['-O3',], 28 | 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] 29 | } 30 | ) 31 | ], 32 | cmdclass={ 33 | 'build_ext': BuildExtension 34 | }) 35 | 36 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/mlperf_logger.py: -------------------------------------------------------------------------------- 1 | # import collections 2 | # import os 3 | # import subprocess 4 | 5 | import torch 6 | from mlperf_common.logging import MLLoggerWrapper 7 | from mlperf_common.frameworks.pyt import PyTCommunicationHandler 8 | 9 | mllogger = MLLoggerWrapper(PyTCommunicationHandler()) -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .activations import bias_gelu_impl 4 | 5 | __all__ = ["bias_gelu_impl"] 6 | 7 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/layers/attention.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/fused.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import mlp_cuda 3 | from torch import nn 4 | from apex import amp 5 | 6 | #implements fused GEMM+bias in forward pass using mlp_cuda from apex 7 | class FusedMlpFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input, weight, bias): 10 | ctx.save_for_backward(input, weight) 11 | output = mlp_cuda.forward(True, 0, (input, weight, bias)) 12 | return output[0] 13 | 14 | @staticmethod 15 | def backward(ctx, grad_output): 16 | input, weight = ctx.saved_tensors 17 | grad_input = grad_output.mm(weight) 18 | grad_weight = grad_output.t().mm(input) 19 | 
grad_bias = torch.sum(grad_output, dim=0) 20 | return grad_input, grad_weight, grad_bias 21 | 22 | mlp_function = amp.half_function(FusedMlpFunction.apply) 23 | 24 | class FusedMlp(nn.Module): 25 | def __init__(self, in_features, out_features, bias=True): 26 | super(FusedMlp, self).__init__() 27 | self.in_features = in_features 28 | self.out_features = out_features 29 | self.weight = nn.Parameter(torch.Tensor(out_features, in_features)) 30 | if bias: 31 | self.bias = nn.Parameter(torch.Tensor(out_features)) 32 | else: 33 | self.register_parameter('bias', None) 34 | 35 | def forward(self, input): 36 | return mlp_function(input, self.weight, self.bias) 37 | 38 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import torch 19 | from torch import nn 20 | 21 | try: 22 | import apex 23 | #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm') 24 | import apex.normalization 25 | #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') 26 | from apex.contrib.layer_norm import FastLayerNorm 27 | BertLayerNorm = FastLayerNorm 28 | except ImportError: 29 | print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") 30 | class BertLayerNorm(nn.Module): 31 | def __init__(self, hidden_size, eps=1e-12): 32 | """Construct a layernorm module in the TF style (epsilon inside the square root). 
33 | """ 34 | super(BertLayerNorm, self).__init__() 35 | self.weight = nn.Parameter(torch.ones(hidden_size)) 36 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 37 | self.variance_epsilon = eps 38 | 39 | def forward(self, x): 40 | u = x.mean(-1, keepdim=True) 41 | s = (x - u).pow(2).mean(-1, keepdim=True) 42 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 43 | return self.weight * x + self.bias 44 | 45 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/losses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/losses/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/model/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/model/models/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/mounts.txt: -------------------------------------------------------------------------------- 1 | ${DATADIR}:/workspace/data 2 | ${DATADIR_PHASE2}:/workspace/data_phase2 3 | ${CHECKPOINTDIR_PHASE1}:/workspace/phase1 4 | ${EVALDIR}:/workspace/evaldata 5 | #${UNITTESTDIR}:/workspace/unit_test_data 6 | # 7 | ${PWD}/run_pretraining.py:/workspace/bert/run_pretraining.py 8 | ${PWD}/run_and_time.sh:/workspace/bert/run_and_time.sh 9 | ${CHECKPOINTDIR}:/workspace/checkpoints 10 | #${PWD}/bert_config_no-dp.json:/workspace/phase1/bert_config.json 11 | #${PWD}/modeling.py:/workspace/bert/modeling.py 12 | #${PWD}/fwd_loss_bwd_trainer.py:/workspace/bert/fwd_loss_bwd_trainer.py 13 | #${PWD}/fmha.py:/workspace/bert/fmha.py 14 | #${PWD}/distributed_fused_lamb.py:/opt/conda/lib/python3.8/site-packages/apex/contrib/optimizers/distributed_fused_lamb.py 15 | #${PWD}/fmhalib.cpython-38-x86_64-linux-gnu.so-cond:/opt/conda/lib/python3.8/site-packages/fmhalib.cpython-38-x86_64-linux-gnu.so 16 | 17 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | boto3==1.14.0 3 | gdown==4.4.0 4 | git+https://github.com/mlcommons/logging.git@2.1.0-rc1 5 | h5py==2.10.0 6 | html2text==2020.1.16 7 | ipdb==0.13.2 8 | nltk==3.5 9 | onnxruntime==1.3.0 10 | parameterized 11 | progressbar==2.5 12 | requests==2.23.0 13 | six==1.15.0 14 | tensorflow==2.2.0 15 | git+https://github.com/NVIDIA/mlperf-common.git@training-v2.1-rc0 16 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m torch.distributed.launch --nproc_per_node=8 \ 4 | -u /workspace/bert/run_pretraining.py \ 5 | --seed=42 \ 6 | --do_train \ 7 | --target_accuracy=0.714 \ 8 | --accuracy_score_averaging=1 \ 9 | --config_file=/workspace/phase1/bert_config.json \ 10 | --skip_checkpoint \ 11 | --output_dir=/results \ 12 | --fp16 \ 13 | --allreduce_post_accumulation --allreduce_post_accumulation_fp16 \ 14 | --gradient_accumulation_steps=1 \ 15 | --log_freq=1 \ 16 | --train_batch_size=4 \ 17 | 
--learning_rate=4e-5 \ 18 | --warmup_proportion=1.0 \ 19 | --input_dir=/workspace/data_phase2 \ 20 | --phase2 \ 21 | --max_seq_length=512 \ 22 | --max_predictions_per_seq=76 \ 23 | --max_steps=100 \ 24 | --init_checkpoint=/workspace/phase1/model.ckpt-28252 \ 25 | 26 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/mlperf/pytorch-22.09/unit_test/__init__.py -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/global_vars.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | # The purpose of this module is to provide a space to allow global variables to work properly inside of the unittest framework 4 | 5 | # An example case is loading the TF weights file. 6 | ## It is ~4GB and read-only, so why not just load it once into a global variable 7 | 8 | tf_weights = None 9 | tf_tensors = None 10 | 11 | pyt_model = None 12 | pyt_checkpoint = None 13 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/test_data_path.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import os 4 | 5 | expected_data_path = '/workspace/unit_test_data' 6 | 7 | def data_path_found(): 8 | return os.path.isdir(expected_data_path) 9 | 10 | def get_path(): 11 | if data_path_found(): 12 | return expected_data_path 13 | else: 14 | raise ValueError('Unit test data not found - missing or not mounted correctly.') 15 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/test_main.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import unittest 4 | 5 | from test_bert_batch_1 import * 6 | #from test_bert_batch_7 import * 7 | from test_embeddings_batch_1 import * 8 | from test_encoders_batch_1 import * 9 | 10 | if __name__ == '__main__': 11 | unittest.main(verbosity=2) 12 | -------------------------------------------------------------------------------- /mlperf/pytorch-22.09/unit_test/unit_test_utils.py: -------------------------------------------------------------------------------- 1 | # NVIDIA 2 | 3 | import numpy as np 4 | 5 | # Fractions of max absolute difference 6 | bins_relative = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1] 7 | 8 | def max_abs_diff_binning(input_a, input_b): 9 | abs_diff = np.abs(input_a - input_b) 10 | max_abs_diff = np.max(abs_diff) 11 | max_idx = np.argmax(abs_diff) 12 | 13 | counts, bins_absolute = np.histogram(abs_diff, np.array(bins_relative) * max_abs_diff) 14 | return counts, bins_absolute, bins_relative, max_idx 15 | 16 | def pyt_tf_mapping(pyt_state_dict, add_prefix=''): 17 | pyt_strings = [x for x in pyt_state_dict.keys()] 18 | converted_strings = [add_prefix + x.replace('.', '/').replace('weight', 'kernel') for x in pyt_strings] 19 | 20 | for idx, item in enumerate(converted_strings): 21 | if 'LayerNorm' in item: 22 | item = item.replace('kernel', 'gamma').replace('bias', 'beta') 23 | elif 'embedding' in item: 24 | item = item.replace('/kernel', '') 25 | 26 | if 'layer/' in item: 27 | item = item.replace('layer/', 'layer_') 28 | 29 | if 'cls/' in item and 'decoder' not in item and 'dense' not in item: 30 | item = 
item.replace('bias', 'output_bias')
31 |             item = item.replace('kernel', 'output_weights')
32 | 
33 |         if 'decoder' in item:
34 |             item = item.replace('predictions/decoder', 'predictions/transform/dense')
35 |             #item = item.replace('predictions/decoder', 'seq_relationship')
36 |         # Add additional rules here
37 | 
38 |         converted_strings[idx] = item
39 | 
40 |     return dict(zip(pyt_state_dict.keys(), converted_strings))
41 | 
--------------------------------------------------------------------------------
/opt_prime/opt_prime/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ai-computing/aicomp/da109c8c246c71e7f671d060dcd6746e6c0ee28e/opt_prime/opt_prime/__init__.py
--------------------------------------------------------------------------------
/torchgpipe_OOO_PP/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2022-present, ETRI, All rights reserved.
4 | 
5 | From PyTorch:
6 | Copyright (c) 2014- Facebook, Inc, All rights reserved.
7 | 
8 | From torchgpipe:
9 | Copyright (c) 2019-2020, Kakao Brain, All rights reserved.
10 | 
11 | 
12 | Redistribution and use in source and binary forms, with or without
13 | modification, are permitted provided that the following conditions are met:
14 | 
15 | 1. Redistributions of source code must retain the above copyright
16 |    notice, this list of conditions and the following disclaimer.
17 | 
18 | 2. Redistributions in binary form must reproduce the above copyright
19 |    notice, this list of conditions and the following disclaimer in the
20 |    documentation and/or other materials provided with the distribution.
21 | 
22 | 3. Neither the name of the copyright holder nor the names of its
23 |    contributors may be used to endorse or promote products derived from this
24 |    software without specific prior written permission.
25 | 
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 | POSSIBILITY OF SUCH DAMAGE.
37 | 
--------------------------------------------------------------------------------
/torchgpipe_OOO_PP/README.md:
--------------------------------------------------------------------------------
1 | ## Pipeline parallelization based on out-of-order technology
2 | 
3 | We present the results of developing a PoC that applies out-of-order technology (https://dl.acm.org/doi/pdf/10.1145/3492321.3519563) to pipeline parallelization.
4 | 
5 | ## Usage
6 | 
7 | This SW requires:
8 | * Python 3
9 | * PyTorch 1.12+
10 | * torchgpipe 0.0.7
11 | 
12 | ## RUN
13 | 
14 | python3 source.py
15 | 
16 | ## License
17 | 
18 | The results of the AIcomp project are distributed under the 3-clause BSD license.
19 | 
--------------------------------------------------------------------------------
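source.py itself is not included in this listing; for orientation, a minimal stock torchgpipe 0.0.7 pipeline — the baseline whose schedule the PoC's out-of-order technique modifies — looks roughly like this (assumes at least two CUDA devices):

import torch
from torch import nn
from torchgpipe import GPipe

# Toy 4-module model, split into two partitions of two modules each,
# with each mini-batch divided into 8 micro-batches.
model = nn.Sequential(
    nn.Linear(512, 512), nn.ReLU(),
    nn.Linear(512, 512), nn.ReLU(),
)
model = GPipe(model, balance=[2, 2], chunks=8)

x = torch.rand(64, 512).to(model.devices[0])  # input lives on the first partition's device
out = model(x)           # pipelined, micro-batched forward
out.mean().backward()    # pipelined backward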