├── .github └── workflows │ └── formatting.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── applications ├── DeepSpeed-Chat │ ├── .gitignore │ ├── README.md │ ├── assets │ │ ├── image │ │ │ ├── 1.3B-breakdown.png │ │ │ ├── Banner-benchmark.png │ │ │ ├── RLHF.png │ │ │ ├── democrat2.png │ │ │ ├── ds-chat-single.gif │ │ │ ├── ds-chat.gif │ │ │ ├── ds-shiba.png │ │ │ ├── e2e_RLHF.png │ │ │ ├── four_blocks.png │ │ │ ├── ppo_trainer.png │ │ │ ├── reward_function.png │ │ │ └── shiba.png │ │ └── video │ │ │ └── release_v3.mp4 │ ├── chat.py │ ├── dschat │ │ ├── rlhf │ │ │ ├── ppo_trainer.py │ │ │ └── rlhf_engine.py │ │ └── utils │ │ │ ├── data │ │ │ ├── data_utils.py │ │ │ └── raw_datasets.py │ │ │ ├── ds_utils.py │ │ │ ├── model │ │ │ ├── model_utils.py │ │ │ └── reward_model.py │ │ │ ├── module │ │ │ └── lora.py │ │ │ ├── perf.py │ │ │ └── utils.py │ ├── e2e_rlhf.py │ ├── inference │ │ └── chatbot.py │ ├── requirements.txt │ ├── setup.py │ ├── tests │ │ └── test_training.py │ └── training │ │ ├── README.md │ │ ├── step1_supervised_finetuning │ │ ├── README.md │ │ ├── evaluation_scripts │ │ │ └── run_prompt.sh │ │ ├── main.py │ │ ├── prompt_eval.py │ │ ├── training_log_output │ │ │ └── opt-1.3b-globalBatchSize128.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ ├── opt │ │ │ ├── multi_node │ │ │ │ └── run_66b.sh │ │ │ ├── single_gpu │ │ │ │ ├── run_1.3b.sh │ │ │ │ └── run_6.7b_lora.sh │ │ │ └── single_node │ │ │ │ ├── run_1.3b.sh │ │ │ │ ├── run_1.3b_lora.sh │ │ │ │ ├── run_13b.sh │ │ │ │ ├── run_30b_lora.sh │ │ │ │ ├── run_6.7b.sh │ │ │ │ └── sweep │ │ │ │ ├── README.md │ │ │ │ ├── run_single.sh │ │ │ │ └── run_step1_sweep.sh │ │ │ └── other_language │ │ │ ├── run_chinese.sh │ │ │ └── run_japanese.sh │ │ ├── step2_dpo_finetuning │ │ ├── README.md │ │ ├── main.py │ │ ├── training_log_output │ │ │ └── opt-350M_globalBatchSize-32.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ └── opt │ │ │ ├── multi_node │ │ │ └── run_350m.sh │ │ │ ├── single_gpu │ │ │ └── run_350m.sh │ │ │ └── single_node │ │ │ ├── run_350m.sh │ │ │ └── sweep │ │ │ ├── README.md │ │ │ ├── run_single.sh │ │ │ └── run_step2_sweep.sh │ │ ├── step2_reward_model_finetuning │ │ ├── README.md │ │ ├── evaluation_scripts │ │ │ └── run_eval.sh │ │ ├── main.py │ │ ├── rw_eval.py │ │ ├── training_log_output │ │ │ └── opt-350m_globalBatchSize-64.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ └── opt │ │ │ ├── multi_node │ │ │ └── run_350m.sh │ │ │ ├── single_gpu │ │ │ └── run_350m.sh │ │ │ └── single_node │ │ │ ├── run_350m.sh │ │ │ └── sweep │ │ │ ├── README.md │ │ │ ├── run_single.sh │ │ │ └── run_step2_sweep.sh │ │ └── step3_rlhf_finetuning │ │ ├── BenckmarkSetting.md │ │ ├── README.md │ │ ├── main.py │ │ ├── training_log_output │ │ └── actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log │ │ └── training_scripts │ │ ├── README.md │ │ ├── llama2 │ │ ├── run_llama2_7b.sh │ │ ├── run_llama2_7b_lora.sh │ │ └── run_llama2_7b_mixz.sh │ │ └── opt │ │ ├── multi_node │ │ └── run_66b.sh │ │ ├── single_gpu │ │ ├── run_1.3b.sh │ │ └── run_6.7b_lora.sh │ │ └── single_node │ │ ├── run_1.3b.sh │ │ ├── run_1.3b_lora.sh │ │ ├── run_13b.sh │ │ ├── run_30b_lora.sh │ │ ├── run_6.7b.sh │ │ └── sweep │ │ ├── README.md │ 
│ ├── run_single.sh │ │ └── run_step3_sweep.sh └── DeepSpeed-VisualChat │ ├── README.md │ ├── assets │ ├── banner.png │ ├── ceos.png │ ├── friends.png │ ├── hero-figure.png │ └── model.png │ ├── chat │ ├── README.md │ ├── chat.py │ └── chat_scripts │ │ └── run.sh │ ├── eval │ ├── README.md │ ├── batch_generation.py │ ├── eval_data │ │ ├── eval_comprehensive.json │ │ ├── eval_robustness.json │ │ ├── eval_single.json │ │ └── images │ │ │ ├── cats │ │ │ ├── 1806905748_adb926a0a0.jpg │ │ │ ├── british_shorthair.jpg │ │ │ └── cat.png │ │ │ ├── friends │ │ │ ├── can-count1.jpg │ │ │ ├── can-count2.jpg │ │ │ ├── wrong-count1.jpg │ │ │ └── wrong-count2.jpg │ │ │ ├── singles │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 202160027_b319c4166e.jpg │ │ │ ├── 50.jpg │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ │ ├── tech-ceo │ │ │ ├── gate1.jpg │ │ │ ├── jobs1.jpg │ │ │ └── musk1.jpg │ │ │ └── zootopia │ │ │ ├── z1.png │ │ │ ├── z2.png │ │ │ ├── z2a.png │ │ │ └── z3.png │ ├── eval_scripts │ │ └── run_batch.sh │ └── results │ │ ├── eval_comprehensive │ │ ├── ours-set1_best_eval.csv │ │ ├── ours-set1_final.csv │ │ ├── ours-set2_best_eval.csv │ │ └── ours-set2_final.csv │ │ ├── eval_robustness │ │ ├── ours-set1_best_eval.csv │ │ ├── ours-set1_final.csv │ │ ├── ours-set2_best_eval.csv │ │ └── ours-set2_final.csv │ │ └── eval_single │ │ ├── ours-single_best_eval.csv │ │ └── ours-single_final.csv │ ├── helper │ ├── README.md │ ├── extract_qwen_vl.py │ └── qwen_clip │ │ ├── config.json │ │ └── preprocessor_config.json │ ├── requirements.txt │ ├── training │ ├── README.md │ ├── main.py │ └── training_scripts │ │ └── run_7b.sh │ └── utils │ ├── data │ ├── DST.py │ ├── __init__.py │ ├── aokvqa_dataset.py │ ├── builder.py │ ├── cc_sbu_align_dataset.py │ ├── coco_caption_dataset.py │ ├── dial_dataset.py │ ├── llava_dataset.py │ ├── llava_otter_blend_dataset.py │ ├── ocr_vqa_dataset.py │ ├── otter_mimicit_cgd_dataset.py │ ├── otter_mimicit_sd_dataset.py │ ├── otter_mimicit_sn_dataset.py │ ├── otter_mimicit_tvc_dataset.py │ ├── otter_mimicit_vst_dataset.py │ ├── sparkles_dialogue_dataset.py │ ├── utils.py │ └── vqa_dataset.py │ ├── ds_utils.py │ ├── model │ ├── __init__.py │ ├── modeling_dsvl.py │ ├── third_party_model │ │ ├── hf_model │ │ │ ├── configuration_llama.py │ │ │ └── modeling_llama.py │ │ └── qwen_clip │ │ │ └── qwen_clip.py │ └── vis_proj.py │ ├── module │ └── lora.py │ └── utils.py ├── benchmarks ├── README.md ├── communication │ ├── README.md │ ├── __init__.py │ ├── all_gather.py │ ├── all_reduce.py │ ├── all_to_all.py │ ├── broadcast.py │ ├── constants.py │ ├── pt2pt.py │ ├── run_all.py │ └── utils.py ├── deepcompile │ ├── .gitignore │ ├── README.md │ ├── configs │ │ ├── ddp_config.yaml.template │ │ ├── ds_config.json.template │ │ ├── ds_config.yaml.template │ │ ├── fsdp_config.yaml.template │ │ └── singlegpu_config.yaml.template │ ├── gen_chart_acc_steps.py │ ├── generate_conf.py │ ├── hostfile_n4 │ ├── plot.py │ ├── plot_common.py │ ├── results │ │ ├── acc_step_1 │ │ │ └── throughput │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs2.png │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs4.png │ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs1.png │ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs2.png │ │ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs4.png │ │ └── acc_step_1_16 │ │ │ └── throughput │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png │ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs1.png │ ├── run.sh │ ├── run_bench.sh │ ├── 
run_bench_acc.sh │ ├── run_bench_lm.py │ ├── run_bench_offload.sh │ ├── run_bench_z1.sh │ └── run_multinode.sh └── inference │ ├── README.md │ ├── bert-bench.py │ ├── collect_results.py │ ├── deepspeedometer │ ├── README.md │ ├── configs │ │ ├── 128k-120.yaml │ │ ├── 1300-120.yaml │ │ ├── 2600-60.yaml │ │ └── 500-500.yaml │ ├── pyproject.toml │ ├── run_example.sh │ ├── src │ │ └── deepspeedometer │ │ │ ├── __init__.py │ │ │ ├── arg_parsing.py │ │ │ ├── benchmark_runner.py │ │ │ ├── clients │ │ │ ├── __init__.py │ │ │ ├── azure_ml_client.py │ │ │ ├── base.py │ │ │ ├── dummy_client.py │ │ │ ├── fastgen_client.py │ │ │ ├── openai_client.py │ │ │ └── vllm_client.py │ │ │ ├── config.py │ │ │ ├── prompt.py │ │ │ ├── response.py │ │ │ └── sample_input.py │ └── tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_benchmark.py │ │ ├── test_config.py │ │ ├── test_early_stop.py │ │ └── test_prompt.py │ ├── gpt-bench.py │ ├── mii │ ├── A6000_benchmarks_example.PNG │ ├── README.md │ ├── plot_config.yaml │ ├── requirements.txt │ ├── run_all.sh │ ├── run_aml.sh │ ├── run_benchmark.py │ ├── run_example.sh │ ├── run_fp6.sh │ └── src │ │ ├── __init__.py │ │ ├── client.py │ │ ├── defaults.py │ │ ├── plot_effective_throughput.py │ │ ├── plot_latency_percentile.py │ │ ├── plot_repl_scale.py │ │ ├── plot_th_lat.py │ │ ├── plot_tp_sizes.py │ │ ├── postprocess_results.py │ │ ├── random_query_generator.py │ │ ├── sample_input.py │ │ ├── server.py │ │ └── utils.py │ ├── requirements.txt │ ├── run_model.sh │ ├── run_triton_benchmark.sh │ ├── sweep.sh │ └── triton-bert-benchmark.py ├── compression ├── README.md ├── bert │ ├── README.md │ ├── bash_script │ │ ├── XTC │ │ │ ├── layer_reduction.sh │ │ │ ├── layer_reduction_1bit.sh │ │ │ └── quant_1bit.sh │ │ ├── ZeroQuant │ │ │ ├── zero_quant.sh │ │ │ └── zero_quant_lkd.sh │ │ ├── layer_reduction.sh │ │ ├── pruning_head.sh │ │ ├── pruning_row.sh │ │ ├── pruning_sparse.sh │ │ ├── pruning_sparse_snip_momentum.sh │ │ ├── quant_activation.sh │ │ └── quant_weight.sh │ ├── config │ │ ├── XTC │ │ │ ├── ds_config_W1A8_Qgroup1_fp32.json │ │ │ ├── ds_config_layer_reduction_W1Q8_fp32.json │ │ │ └── ds_config_layer_reduction_fp16.json │ │ ├── ZeroQuant │ │ │ ├── ds_config_W48A8_Qgroup48_lkd_fp32.json │ │ │ └── ds_config_W8A8_Qgroup48_fp32.json │ │ ├── ds_config.json │ │ ├── ds_config_TEMPLATE.json │ │ ├── ds_config_W1A8_Qgroup64_fp16.json │ │ ├── ds_config_W1A8_Qgroup64_fp32.json │ │ ├── ds_config_W1or2A8_Qgroup64_fp16.json │ │ └── ds_config_structural_pruning_TEMPLATE.json │ ├── huggingface_transformer │ │ └── modeling_bert.py │ ├── requirements.txt │ ├── run_glue_lkd.py │ ├── run_glue_no_trainer.py │ └── util.py ├── cifar │ ├── README.md │ ├── config │ │ ├── ds_config.json │ │ └── ds_config_channel_prune.json │ ├── resnet.py │ ├── run_compress.sh │ ├── train.py │ └── utils.py └── gpt2 │ ├── README.md │ ├── bash_script │ └── run_zero_quant.sh │ ├── config │ ├── ds_config.json │ ├── ds_config_W4or8A8_Qgroup64_fp16.json │ ├── ds_config_W4or8A8_Qgroup64_fp32.json │ ├── ds_config_W8A8_Qgroup64_fp16.json │ └── ds_config_W8A8_Qgroup64_fp32.json │ ├── requirements.txt │ └── run_clm_no_trainer.py ├── deepnvme ├── file_access │ ├── README.md │ ├── aio_load_cpu_tensor.py │ ├── aio_load_gpu_tensor.py │ ├── aio_store_cpu_tensor.py │ ├── aio_store_gpu_tensor.py │ ├── gds_load_gpu_tensor.py │ ├── gds_store_gpu_tensor.py │ ├── media │ │ └── deepnvme_ops_report.png │ ├── py_load_cpu_tensor.py │ ├── py_load_gpu_tensor.py │ ├── py_store_cpu_tensor.py │ ├── 
py_store_gpu_tensor.py │ ├── run_load_tensor.sh │ ├── run_store_tensor.sh │ └── utils.py ├── model_checkpoint │ ├── README.md │ ├── deepspeed_save_model.py │ ├── requirements.txt │ ├── save_model_utils.py │ ├── torch │ │ ├── serialization_fast_v2.6.0.py │ │ └── serialization_orig_v2.6.0.py │ ├── torch_save_model.py │ ├── torch_save_tensor.py │ └── torch_save_utils.py └── zero_inference │ ├── README.md │ └── media │ ├── nvme_config.png │ ├── zero_inf_mem_use_cpu.png │ └── zero_inf_mem_use_gds.png ├── evaluation └── inference │ └── human_eval │ ├── README.md │ └── run_human_eval.py ├── inference ├── huggingface │ ├── README.md │ ├── automatic-speech-recognition │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-wav2vec2.py │ ├── fill-mask │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── test-bert.py │ │ ├── test-electra.py │ │ └── test-roberta.py │ ├── stable-diffusion │ │ ├── README.md │ │ ├── local_pipeline_stable_diffusion.py │ │ ├── requirements.txt │ │ └── test-stable-diffusion.py │ ├── text-generation │ │ ├── README.md │ │ ├── arguments.py │ │ ├── ds-hf-compare.py │ │ ├── inference-test.py │ │ ├── requirements.txt │ │ ├── run-generation-script │ │ │ ├── README.md │ │ │ ├── requirements.txt │ │ │ ├── sample_query.txt │ │ │ ├── single_query.txt │ │ │ ├── test-gpt.sh │ │ │ └── test-run-generation.py │ │ └── utils.py │ ├── text2text-generation │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-t5.py │ ├── translation │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-t5-base.py │ └── zero_inference │ │ ├── README.md │ │ ├── images │ │ └── over_v1.png │ │ ├── model-support.md │ │ ├── requirements.txt │ │ ├── run_bloom175b_a6000.sh │ │ ├── run_llama2_70b_a6000.sh │ │ ├── run_model.py │ │ ├── run_model.sh │ │ ├── run_opt175b_a6000.sh │ │ ├── run_opt1p3b_a6000.sh │ │ ├── run_opt30b_a6000.sh │ │ ├── run_opt66b_a6000.sh │ │ ├── timer.py │ │ └── utils.py ├── mii │ ├── README.md │ ├── non-persistent │ │ ├── README.md │ │ ├── falcon.py │ │ ├── llama2.py │ │ ├── mixtral.py │ │ └── pipeline.py │ ├── persistent │ │ ├── README.md │ │ ├── client.py │ │ ├── serve.py │ │ └── terminate.py │ └── requirements.txt └── sglang │ ├── README.md │ ├── ds_offload_cpu.json │ ├── ds_offload_nvme_aio.json │ ├── ds_offload_nvme_gds.json │ ├── run_llama3_1B.sh │ ├── run_llama3_70B.sh │ └── run_llama3_8B.sh ├── scripts └── check-license.py └── training ├── BingBertGlue ├── glue_bert_base.json ├── glue_bert_large.json ├── nvidia │ ├── modeling.py │ ├── modelingpreln.py │ └── modelingpreln_layerdrop.py ├── nvidia_bert_dataset_provider.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── run_glue_bert_base_finetune.sh ├── run_glue_bert_large_finetune.sh ├── run_glue_classifier_bert_base.py ├── run_glue_classifier_bert_large.py └── turing │ ├── dataset.py │ ├── file_utils.py │ ├── logger.py │ ├── loss.py │ ├── models.py │ ├── sources.py │ ├── text.py │ └── utils.py ├── BingBertSquad ├── 1-bit_adam │ ├── mpi_ethernet │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ ├── run_squad_deepspeed_onebitadam.sh │ │ └── run_squad_mpi_onebitadam.sh │ ├── mpi_infiniband │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ ├── run_squad_deepspeed_onebitadam.sh │ │ └── run_squad_mpi_onebitadam.sh │ └── nccl │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ └── run_squad_deepspeed_onebitadam.sh ├── NOTICE.txt ├── ckpt │ └── bert-large-uncased-whole-word-masking-config.json ├── 
convert_bert_ckpt_to_deepspeed.py ├── deepspeed_bsz24_config.json ├── evaluate-v1.1.py ├── evaluate.py ├── nvidia_run_squad_baseline.py ├── nvidia_run_squad_deepspeed.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── run_hf.sh ├── run_squad_baseline.sh ├── run_squad_deepspeed.sh ├── turing │ ├── file_utils.py │ ├── loss.py │ ├── modelingpreln_layerdrop.py │ ├── nvidia_modeling.py │ └── nvidia_modelingpreln.py └── utils.py ├── DeepSpeed-Domino ├── README.md ├── domino │ ├── gpt_model.py │ ├── language_model.py │ └── training.py ├── pretrain_gpt.py ├── pretrain_gpt3_13b.sh ├── pretrain_gpt3_6.7b.sh └── requirements.txt ├── HelloDeepSpeed ├── README.md ├── requirements.txt ├── run.sh ├── run_ds.sh ├── tests │ ├── __init__.py │ └── test_train_bert.py ├── train_bert.py └── train_bert_ds.py ├── MoQ ├── README.md ├── huggingface-transformers │ └── examples │ │ └── research_projects │ │ └── lxmert │ │ └── requirements.txt ├── requirements.txt ├── run.sh ├── run_glue.py └── test.json ├── autotuning ├── .gitignore ├── README.md └── hf │ ├── README.md │ ├── bert-base │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── bert-large │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── deberta │ ├── README.md │ ├── ds_config_fp16_tune.json │ └── test_tune.sh │ ├── distilbert │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── dsconfigs │ ├── ds_config_fp16_tune.json │ ├── ds_config_fp16_z0.json │ ├── ds_config_fp16_z1.json │ ├── ds_config_fp16_z2.json │ ├── ds_config_fp16_z3.json │ ├── ds_config_tune.json │ ├── ds_config_z0.json │ ├── ds_config_z1.json │ ├── ds_config_z2.json │ └── ds_config_z3.json │ ├── gpt2-large │ ├── README.md │ └── test_tune.sh │ ├── gpt2-medium │ ├── README.md │ └── test_tune.sh │ ├── gpt2-xl │ ├── README.md │ └── test_tune.sh │ └── gpt2 │ ├── README.md │ └── test_tune.sh ├── bing_bert ├── 01_adam │ ├── mpi_ethernet │ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz4k_01adam_config_seq128_nccl.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_nccl.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_nccl.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_nccl.sh ├── 1-bit_adam │ ├── mpi_ethernet │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json │ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_ethernet.sh │ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json │ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh │ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_nccl.json │ │ └── ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh ├── 1-bit_lamb │ ├── mpi_ethernet │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_ethernet.sh │ │ ├── 
ds_train_bert_onebitlamb_bsz64k_seq128_mpi_ethernet.sh │ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_ethernet.sh │ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_infiniband.sh │ │ ├── ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh │ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_infiniband.sh │ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_nccl.sh │ │ └── ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh ├── NOTICE.txt ├── README.md ├── bert_base.json ├── bert_base_large_lr.json ├── bert_dataset_provider.py ├── bert_large.json ├── bert_large_lamb.json ├── bert_large_lamb_nvidia_data.json ├── bing_bert_dataset_provider.py ├── data_worker.py ├── deepspeed_bsz32k_lamb_config_seq512.json ├── deepspeed_bsz4k_progressive_layer_drop_config_seq128.json ├── deepspeed_bsz64k_lamb_config_seq128.json ├── deepspeed_train.py ├── ds_sa_train_bert_bsz64k_seq128.sh ├── ds_train_bert_bsz32k_seq512.sh ├── ds_train_bert_bsz64k_seq128.sh ├── ds_train_bert_nvidia_data_bsz32k_seq512.sh ├── ds_train_bert_nvidia_data_bsz64k_seq128.sh ├── ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh ├── glue_bert_base.json ├── glue_bert_large.json ├── nvidia │ ├── modelingpreln.py │ └── modelingpreln_layerdrop.py ├── nvidia_bert_dataset_provider.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── requirements.txt ├── run_glue_bert_base_finetune.sh ├── run_glue_bert_large_finetune.sh ├── run_glue_classifier_bert_base.py ├── run_glue_classifier_bert_large.py ├── timer.py ├── turing │ ├── dataset.py │ ├── file_utils.py │ ├── logger.py │ ├── loss.py │ ├── models.py │ ├── sources.py │ ├── text.py │ └── utils.py └── utils.py ├── cifar ├── LICENSE ├── NOTICE.txt ├── README.md ├── cifar10_deepspeed.py ├── cifar10_tutorial.py ├── requirements.txt ├── run_ds.sh ├── run_ds_moe.sh └── run_ds_prmoe.sh ├── data_efficiency ├── gpt_finetuning │ ├── README.md │ ├── analyze_data.py │ ├── bash_script │ │ ├── run_base_random_ltd.sh │ │ └── run_medium_random_ltd.sh │ ├── config │ │ ├── ds_config_gpt_base_random_ltd.json │ │ └── ds_config_gpt_medium_random_ltd.json │ ├── finetune │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── ds_config_gpt2-medium_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt2-medium_2clmetrics_TEMPLATE.json │ │ ├── ds_config_gpt2_TEMPLATE.json │ │ ├── ds_finetune_gpt2.sh │ │ └── ds_finetune_gpt2_run.sh │ ├── learning_rates.py │ ├── requirement.txt │ └── run_clm_no_trainer.py ├── variable_batch_size_and_lr │ ├── README.md │ ├── variable_attn_matrix.png │ ├── variable_batch_lr.png │ ├── variable_batch_lr_pipeline.png │ └── variable_batch_size_and_lr_example.py └── vit_finetuning │ ├── README.md │ ├── bash_script │ ├── run_cifar_random_ltd.sh │ └── run_imagenet_random_ltd.sh │ ├── config │ ├── ds_config_cifar_random_ltd.json │ └── ds_config_imagenet_random_ltd.json │ ├── main_cifar.py │ ├── main_imagenet.py │ ├── models │ ├── __init__.py │ └── vit.py │ ├── requirement.txt │ └── utils │ ├── __init__.py │ ├── get_data.py │ └── utils.py ├── gan 
├── gan_baseline_run.sh ├── gan_baseline_train.py ├── gan_deepspeed_config.json ├── gan_deepspeed_run.sh ├── gan_deepspeed_train.py ├── gan_model.py └── utils.py ├── imagenet ├── README.md ├── assets │ └── resnetplot.png ├── config │ ├── ds_config.json │ ├── ds_fp16_config.json │ └── ds_fp16_z1_config.json ├── extract_ILSVRC.sh ├── main.py ├── requirements.txt ├── run_ds.sh ├── run_ds_fp16.sh └── run_ds_fp16_z1.sh ├── megatron └── README.md ├── offload_states ├── README.md ├── offload_states.py ├── output_table.py └── run_benchmark.sh ├── pipeline_parallelism ├── alexnet.py ├── ds_config.json ├── run.sh └── train.py ├── stable_diffusion ├── README.md ├── inf_txt2img_loop.py ├── local_pipeline_stable_diffusion.py ├── mytrainbash.sh ├── requirements.txt └── train_sd_distil_lora.py └── tensor_parallel ├── README.md ├── alpaca_data.json ├── configs ├── ds_config.json └── ds_config_temp.json ├── requirements.txt ├── run.sh ├── train.py ├── train_bench_length.py └── utils.py /.github/workflows/formatting.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'staging**' 7 | pull_request: 8 | branches: 9 | '**' 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | 17 | # formatting and basic install on cpu-only machine 18 | formatting: 19 | runs-on: ubuntu-22.04 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: environment 25 | run: | 26 | which python 27 | python --version 28 | pip install pre-commit>=2.20.0 29 | 30 | - name: Formatting checks 31 | run: | 32 | pre-commit run --all-files 33 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "training/DeepSpeed-Domino/Megatron-LM"] 2 | path = training/DeepSpeed-Domino/Megatron-LM 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v1.2.3 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: "Megatron-LM/" 7 | files: ^applications/DeepSpeed-Chat/.+ 8 | - id: check-yaml 9 | exclude: "Megatron-LM/" 10 | files: ^applications/DeepSpeed-Chat/.+ 11 | - id: end-of-file-fixer 12 | exclude: "Megatron-LM/" 13 | files: ^applications/DeepSpeed-Chat/.+ 14 | 15 | 16 | - repo: https://github.com/google/yapf 17 | rev: v0.32.0 18 | hooks: 19 | - id: yapf 20 | files: ^applications/DeepSpeed-Chat/.+ 21 | 22 | - repo: https://github.com/pycqa/flake8 23 | rev: 4.0.1 24 | hooks: 25 | - id: flake8 26 | args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401'] 27 | files: ^applications/DeepSpeed-Chat/.+ 28 | 29 | - repo: local 30 | hooks: 31 | - id: check-license 32 | name: check-license 33 | entry: ./scripts/check-license.py 34 | language: script 35 | files: ^applications/DeepSpeed-Chat/.+\.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr|sh)$ 36 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tjruwase @ShadenSmith @awan-10 @minjiaz 2 | -------------------------------------------------------------------------------- 
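The `formatting.yml` workflow and `.pre-commit-config.yaml` above make up the repository's formatting gate. A minimal sketch of reproducing the same checks locally, assuming a working Python environment:

```bash
# Install pre-commit; the quotes keep the shell from treating ">=" as a redirect.
pip install "pre-commit>=2.20.0"

# Run every configured hook (trailing-whitespace, yapf, flake8, check-license, ...)
# against all files, exactly as the CI formatting job does.
pre-commit run --all-files
```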
/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/RLHF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/RLHF.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/democrat2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/democrat2.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-chat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat.gif -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-shiba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-shiba.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/four_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/four_blocks.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ppo_trainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ppo_trainer.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/reward_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/reward_function.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/shiba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/shiba.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/video/release_v3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/video/release_v3.mp4 -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/chat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | import subprocess 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--path", 12 | type=str, 13 | help="Directory containing trained actor model") 14 | parser.add_argument( 15 | "--max_new_tokens", 16 | type=int, 17 | default=128, 18 | help="Maximum new tokens to generate per response", 19 | ) 20 | args = parser.parse_args() 21 | 22 | cmd = f"python3 ./inference/chatbot.py --path {args.path} --max_new_tokens {args.max_new_tokens}" 23 | p = subprocess.Popen(cmd, shell=True) 24 | p.wait() 25 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | deepspeed>=0.9.0 7 | transformers>=4.31.0,!=4.33.2 8 | tensorboard 9 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # setup.py: install script for deepspeed_chat 8 | """ 9 | to install deepspeed_chat and its dependencies for development work, 10 | run this cmd from the root directory: 11 | pip install -e . 12 | """ 13 | import setuptools 14 | 15 | setuptools.setup( 16 | name="deepspeed-chat", 17 | version="0.1", 18 | url= 19 | "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", 20 | include_package_data=True, 21 | packages=setuptools.find_packages(include=['dschat']), 22 | install_requires=[ 23 | "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", 24 | "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", 25 | "transformers>=4.31.0,!=4.33.2", "tensorboard" 26 | ], 27 | extras_require={ 28 | "azureml": [ 29 | "azure-ml-component", 30 | "azureml-core", 31 | ], 32 | }) 33 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # You can provide two models to compare the performance of the baseline and the finetuned model 8 | export CUDA_VISIBLE_DEVICES=0 9 | python prompt_eval.py \ 10 | --model_name_or_path_baseline XXX \ 11 | --model_name_or_path_finetune XXX 12 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of "facebook/opt" family. 2 | 3 | If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace 4 | `` --model_name_or_path facebook/opt-1.3b`` to ``--model_name_or_path EleutherAI/gpt-j-6b ``. 
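As a purely illustrative one-liner, the swap could also be scripted; the `sed` pattern below assumes the flag appears exactly as written in `training_scripts/opt/single_node/run_1.3b.sh`:

```bash
# Hypothetical in-place edit; inspect the script afterwards before running it.
sed -i 's|--model_name_or_path facebook/opt-1.3b|--model_name_or_path EleutherAI/gpt-j-6b|' \
    training_scripts/opt/single_node/run_1.3b.sh
```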
5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step1_llama2_7b 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 4 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step1_llama2_7b_lora 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 4 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --lora_dim 128 \ 34 | --lora_module_name "layers." \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/multi_node/run_66b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-66b \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 1e-4 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 2 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --lora_dim 128 \ 33 | --lora_module_name decoder.layers. \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT=$1 9 | ZERO_STAGE=$2 10 | if [ "$OUTPUT" == "" ]; then 11 | OUTPUT=./output 12 | fi 13 | if [ "$ZERO_STAGE" == "" ]; then 14 | ZERO_STAGE=0 15 | fi 16 | mkdir -p $OUTPUT 17 | 18 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-1.3b \ 19 | --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \ 20 | --enable_tensorboard \ 21 | --tensorboard_path $OUTPUT \ 22 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 23 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT_PATH=./output 9 | mkdir -p $OUTPUT_PATH 10 | 11 | deepspeed --num_gpus 1 main.py \ 12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 13 | --data_split 2,4,4 \ 14 | --model_name_or_path facebook/opt-6.7b \ 15 | --per_device_train_batch_size 8 \ 16 | --per_device_eval_batch_size 8 \ 17 | --max_seq_len 512 \ 18 | --learning_rate 1e-3 \ 19 | --weight_decay 0. \ 20 | --num_train_epochs 16 \ 21 | --gradient_accumulation_steps 16 \ 22 | --lr_scheduler_type cosine \ 23 | --num_warmup_steps 0 \ 24 | --seed 1234 \ 25 | --gradient_checkpointing \ 26 | --zero_stage 0 \ 27 | --lora_dim 128 \ 28 | --lora_module_name decoder.layers. \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT_PATH \ 31 | &> $OUTPUT_PATH/training.log 32 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-1.3b \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --zero_stage $ZERO_STAGE \ 31 | --deepspeed \ 32 | --enable_tensorboard \ 33 | --tensorboard_path $OUTPUT \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT_PATH=./output 9 | mkdir -p $OUTPUT_PATH 10 | 11 | deepspeed main.py \ 12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 13 | --data_split 2,4,4 \ 14 | --model_name_or_path facebook/opt-1.3b \ 15 | --per_device_train_batch_size 8 \ 16 | --per_device_eval_batch_size 8 \ 17 | --max_seq_len 512 \ 18 | --learning_rate 1e-3 \ 19 | --weight_decay 0.1 \ 20 | --num_train_epochs 16 \ 21 | --gradient_accumulation_steps 1 \ 22 | --lr_scheduler_type cosine \ 23 | --num_warmup_steps 0 \ 24 | --seed 1234 \ 25 | --zero_stage 0 \ 26 | --lora_dim 128 \ 27 | --lora_module_name decoder.layers. \ 28 | --only_optimize_lora \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT_PATH \ 31 | &> $OUTPUT_PATH/training.log 32 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_13b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-13b \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 1e-4 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --lora_dim 128 \ 33 | --lora_module_name decoder.layers. 
\ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_30b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT_PATH=./output 7 | mkdir -p $OUTPUT_PATH 8 | 9 | deepspeed main.py \ 10 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 11 | --data_split 2,4,4 \ 12 | --model_name_or_path facebook/opt-30b \ 13 | --per_device_train_batch_size 4 \ 14 | --per_device_eval_batch_size 4 \ 15 | --max_seq_len 512 \ 16 | --learning_rate 9.65e-6 \ 17 | --weight_decay 0. \ 18 | --num_train_epochs 16 \ 19 | --gradient_accumulation_steps 1 \ 20 | --lr_scheduler_type cosine \ 21 | --num_warmup_steps 0 \ 22 | --seed 1234 \ 23 | --lora_dim 128 \ 24 | --gradient_checkpointing \ 25 | --zero_stage 3 \ 26 | --deepspeed \ 27 | --output_dir $OUTPUT_PATH \ 28 | &> $OUTPUT_PATH/training.log 29 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_6.7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-6.7b \ 20 | --per_device_train_batch_size 6 \ 21 | --per_device_eval_batch_size 6 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 1 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3
11 | Offload: True, False
12 | LoRA: True, False
13 | 
14 | 15 | The `run_step1_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay); a sketch of one such extension appears at the end of this README. 16 | 17 | # Usage 18 | The sweep script can be run as follows: 19 |
20 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/opt/single_node/sweep/run_step1_sweep.sh
21 | 
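The loop below is a minimal sketch of one such extension, adding a hypothetical learning-rate axis to the existing sweep; the extra `${lr}` positional argument is an assumption and would need a matching change in `run_single.sh`:

```bash
# Sketch only: the zero-stage/offload/lora sweep from run_step1_sweep.sh,
# extended with an assumed learning-rate dimension. run_single.sh would have
# to accept and forward the extra ${lr} argument for this to work.
for z in {2..3}; do
    for offload in true false; do
        for lora in true false; do
            for lr in 9.65e-6 1e-3; do
                bash training_scripts/opt/single_node/sweep/run_single.sh \
                    ${z} ${offload} ${lora} ${lr} \
                    z${z}_offload_${offload}_lora_${lora}_lr_${lr}
            done
        done
    done
done
```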
22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | for lora in true false 11 | do 12 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 13 | ${z} \ 14 | ${offload} \ 15 | ${lora} \ 16 | z${z}_offload_${offload}_lora_${lora}" 17 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 18 | echo $cmd 19 | $cmd 20 | pkill -9 python 21 | sleep 60 22 | echo "" 23 | done 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | # The Chinese data we found mostly only contain one response without another 17 | # "rejected" response. Thus we only test the step 1 finetuning and use 18 | # a data_split of 10,0,0 (keep all data for step 1). 19 | deepspeed main.py \ 20 | --data_path wangrui6/Zhihu-KOL Cohere/miracl-zh-queries-22-12 Hello-SimpleAI/HC3-Chinese mkqa-Chinese \ 21 | --data_split 10,0,0 \ 22 | --model_name_or_path bigscience/bloom-1b1 \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 8 \ 25 | --max_seq_len 512 \ 26 | --learning_rate 9.65e-6 \ 27 | --weight_decay 0. \ 28 | --num_train_epochs 16 \ 29 | --gradient_accumulation_steps 1 \ 30 | --lr_scheduler_type cosine \ 31 | --num_warmup_steps 0 \ 32 | --seed 1234 \ 33 | --zero_stage $ZERO_STAGE \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_japanese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | # The Japanese data we found mostly only contain one response without another 17 | # "rejected" response. Thus we only test the step 1 finetuning and use 18 | # a data_split of 10,0,0 (keep all data for step 1). 19 | deepspeed main.py \ 20 | --data_path mkqa-Japanese Cohere/miracl-ja-queries-22-12 lmqg/qg_jaquad lmqg/qag_jaquad \ 21 | --data_split 10,0,0 \ 22 | --model_name_or_path sberbank-ai/mGPT \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 8 \ 25 | --max_seq_len 512 \ 26 | --learning_rate 9.65e-6 \ 27 | --weight_decay 0. 
\ 28 | --num_train_epochs 16 \ 29 | --gradient_accumulation_steps 1 \ 30 | --lr_scheduler_type cosine \ 31 | --num_warmup_steps 0 \ 32 | --seed 1234 \ 33 | --zero_stage $ZERO_STAGE \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of "facebook/opt" family. 2 | 3 | If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace 4 | `` --model_name_or_path facebook/opt-350m`` to ``--model_name_or_path EleutherAI/gpt-neo-125m ``. 5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --offload \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --offload \ 34 | --lora_dim 128 \ 35 | --lora_module_name "layers." 
\ 36 | --output_dir $OUTPUT \ 37 | &> $OUTPUT/training.log 38 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --per_device_train_batch_size 2 \ 21 | --per_device_eval_batch_size 2 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 5e-5 \ 24 | --weight_decay 0.1 \ 25 | --dropout 0.0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ 17 | --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ 18 | --enable_tensorboard \ 19 | --tensorboard_path $OUTPUT \ 20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 5e-5 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --dropout 0.0 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Offload: True, False
12 | 
13 | 14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay); see the sketch after the usage example below. 15 | 16 | # Usage 17 | The sweep script can be run as follows: 18 | <pre>
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh<br>
20 | 
21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 11 | ${z} \ 12 | ${offload} \ 13 | z${z}_offload_${offload}" 14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 15 | echo $cmd 16 | $cmd 17 | pkill -9 python 18 | sleep 60 19 | echo "" 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/evaluation_scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Add the path to the finetuned model 8 | python rw_eval.py \ 9 | --model_name_or_path 10 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family. 2 | 3 | If you want to change the model, e.g. to EleutherAI/gpt-neo-125m, simply replace 4 | ``--model_name_or_path facebook/opt-350m`` with ``--model_name_or_path EleutherAI/gpt-neo-125m``. 5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_padding_at_beginning 0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --gradient_checkpointing \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --offload \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_padding_at_beginning 0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --gradient_checkpointing \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --offload \ 35 | --lora_dim 128 \ 36 | --lora_module_name "layers." \ 37 | --output_dir $OUTPUT \ 38 | &> $OUTPUT/training.log 39 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --num_padding_at_beginning 1 \ 21 | --per_device_train_batch_size 2 \ 22 | --per_device_eval_batch_size 2 \ 23 | --max_seq_len 512 \ 24 | --learning_rate 5e-5 \ 25 | --weight_decay 0.1 \ 26 | --dropout 0.0 \ 27 | --num_train_epochs 1 \ 28 | --gradient_accumulation_steps 1 \ 29 | --lr_scheduler_type cosine \ 30 | --num_warmup_steps 0 \ 31 | --seed 1234 \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ 17 | --num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ 18 | --enable_tensorboard \ 19 | --tensorboard_path $OUTPUT \ 20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --num_padding_at_beginning 1 \ 21 | --per_device_train_batch_size 4 \ 22 | --per_device_eval_batch_size 4 \ 23 | --max_seq_len 512 \ 24 | --learning_rate 5e-5 \ 25 | --weight_decay 0.1 \ 26 | --num_train_epochs 1 \ 27 | --dropout 0.0 \ 28 | --gradient_accumulation_steps 1 \ 29 | --lr_scheduler_type cosine \ 30 | --num_warmup_steps 0 \ 31 | --seed 1234 \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Offload: True, False
12 | 
13 | 14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay). 15 | 16 | # Usage 17 | The sweep script can be run as follows: 18 | <pre>
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
20 | 
21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 11 | ${z} \ 12 | ${offload} \ 13 | z${z}_offload_${offload}" 14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 15 | echo $cmd 16 | $cmd 17 | pkill -9 python 18 | sleep 60 19 | echo "" 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family. 2 | 3 | If you want to change the models, simply update 4 | ``` --actor_model_name_or_path ${step1_path} --critic_model_name_or_path ${step2_path} ``` to point at your own step 1 and step 2 checkpoints. 5 | 6 | If you don't have step 1 and step 2 models, you may simply try 7 | ``` bash 8 | --actor_model_name_or_path facebook/opt-1.3b --critic_model_name_or_path facebook/opt-350m 9 | ``` 10 | ⚡⚡⚡ When you use the above script, please make sure to set the `rlhf_training` parameter to `False` in both calls to the `create_critic_model` function in [rlhf_engine.py](./../../step3_rlhf_finetuning/rlhf_engine.py), so that it does not load model weights from the previous paths. 11 | 12 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 13 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | ACTOR_MODEL_PATH=$1 7 | CRITIC_MODEL_PATH=$2 8 | ACTOR_ZERO_STAGE=$3 9 | CRITIC_ZERO_STAGE=$4 10 | OUTPUT=$5 11 | if [ "$OUTPUT" == "" ]; then 12 | OUTPUT=./output 13 | fi 14 | if [ "$ACTOR_ZERO_STAGE" == "" ]; then 15 | ACTOR_ZERO_STAGE=0 16 | fi 17 | if [ "$CRITIC_ZERO_STAGE" == "" ]; then 18 | CRITIC_ZERO_STAGE=0 19 | fi 20 | mkdir -p $OUTPUT 21 | 22 | deepspeed --num_gpus 1 main.py \ 23 | --actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \ 24 | --actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \ 25 | --num_padding_at_beginning 1 --gradient_accumulation_steps 2 \ 26 | --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \ 27 | --output_dir $OUTPUT &> $OUTPUT/training.log 28 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 3 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Hybrid Engine: True, False
12 | Offload: True, False
13 | LoRA: True, False<br>
14 | 
15 | 16 | The `run_step3_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay); see the sketch after the usage example below. 17 | 18 | # Usage 19 | The sweep script can be run as follows: 20 | <pre>
21 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
22 | 
23 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/banner.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/ceos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/ceos.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/friends.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/friends.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/hero-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/hero-figure.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/model.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/chat/README.md: -------------------------------------------------------------------------------- 1 | We provide a CLI interface for users to test their trained chat model. Please note that you need to provide both the trained checkpoint and the paths to the original language model and vision encoder: the model is first initialized and then the trained checkpoint is loaded. Also note that if you used multi-modal causal attention during training, remember to pass --enable_mmca_attention in your chat script. -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | MAIN_PATH=$1 7 | 8 | VISION_ENCODER=/blob/transformers_cache/qwen-clip 9 | LLM=/blob/transformers_cache/Llama-2-13b-hf 10 | 11 | export CUDA_VISIBLE_DEVICES=0 # Do single GPU evaluation 12 | # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # Do multi gpu evaluation for large models (single GPU is not enough) 13 | 14 | 15 | python chat.py \ 16 | --lm_model_name_or_path $LLM \ 17 | --vision_model_name_or_path $VISION_ENCODER \ 18 | --checkpoint_path $MAIN_PATH --enable_mmca_attention 19 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json: -------------------------------------------------------------------------------- 1 | { 2 | "cat_images1": [["please describe the image", "./eval/eval_data/images/cats/cat.png"]], 3 | "cat_images2": [["can you describe the image", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"]], 4 | "cat_images3": [["please describe the image", "./eval/eval_data/images/cats/british_shorthair.jpg"]], 5 | "extreme_ironing": [["What is unusual about this image?", "./eval/eval_data/images/singles/extreme_ironing.jpg"]], 6 | "waterview": [["What are the things I should be cautious about when I visit here?", "./eval/eval_data/images/singles/waterview.jpg"]], 7 | "art-dog": [["can you describe the image", "./eval/eval_data/images/singles/202160027_b319c4166e.jpg"]], 8 | "funny-phone": [["What is funny about this image? Describe it panel by panel.", "./eval/eval_data/images/singles/1.jpg"]], 9 | "squirrel": [["Why would a person find this image funny?", "./eval/eval_data/images/singles/2.jpg"]], 10 | "art-painting": [["Tell me about this work of art.", "./eval/eval_data/images/singles/50.jpg"]] 11 | } 12 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg
-------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg -------------------------------------------------------------------------------- 
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # DeepSpeed Team 5 | 6 | #EVAL_DATASET=eval_robustness eval_single eval_comprehensive (see the json files in the folder ./eval_data/*.json) 7 | MAIN_PATH=$1 8 | VISION_MODEL=/blob/transformers_cache/qwen-clip #openai/clip-vit-large-patch14 9 | LLM=/blob/transformers_cache/Llama-2-13b-hf #meta-llama/Llama-2-7b 10 | for EVAL_DATASET in eval_single eval_comprehensive eval_robustness 11 | do 12 | SAVE_PATH=eval/results/${EVAL_DATASET} 13 | mkdir -p ${SAVE_PATH} 14 | for CKPT_NAME in final best_eval 15 | do 16 | #NOTE: to run multi-GPU, you simply do "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;" 17 | export CUDA_VISIBLE_DEVICES=0; python eval/batch_generation.py --model_name dsvl --vis_proj baseline --max_seq_len 4096 \ 18 | --lm_model_name_or_path ${LLM} --vision_model_name_or_path ${VISION_MODEL} \ 19 | --checkpoint_path $MAIN_PATH --checkpoint_names $CKPT_NAME --eval_data ${EVAL_DATASET} \ 20 | --enable_mmca_attention --output_filename ${SAVE_PATH}/ours_${CKPT_NAME} &> ${SAVE_PATH}/ours_${CKPT_NAME}.log 21 | done 22 | done 23 | 24 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/README.md: -------------------------------------------------------------------------------- 1 | # QWen-VL's Vision Encoder 2 | The `extract_qwen_vl.py` script can be used to extract the vision encoder from QWen-VL. After extraction, you can find the other necessary files in the [folder](./qwen_clip). -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | import torch 3 | 4 | PATH = "Qwen/Qwen-VL-Chat" 5 | 6 | model = AutoModelForCausalLM.from_pretrained(PATH, device_map="cuda", trust_remote_code=True).eval() 7 | 8 | state_dict = model.state_dict() 9 | save_dict = {} 10 | for k,v in state_dict.items(): 11 | if 'visual' in k: 12 | if 'transformer.visual.proj' not in k: # we don't need the proj layer 13 | save_dict[k.replace('transformer.visual.', '')] = v 14 | torch.save(save_dict, './qwen_clip/pytorch_model.bin') -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 448, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 448 19 | } 20 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | open_clip_torch 6 | deepspeed>=0.10.3 7 | einops 8 | einops_exts 9 | transformers==4.33.3 10 | transformers_stream_generator 11 | termcolor -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/utils/data/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is adapted from
https://github.com/open-mmlab/Multimodal-GPT 2 | 3 | from .builder import build_dataset # noqa: F401 4 | from .vqa_dataset import VQADataset # noqa: F401 5 | from .utils import DataCollatorPadToMaxLen, split_dataset, shuffle_dataset # noqa: F401 6 | from .DST import add_special_token -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/utils/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_dsvl import create_dsvl_model_and_transforms -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | All benchmarks that use the DeepSpeed library are maintained in this folder. We welcome contributions in this space! 2 | -------------------------------------------------------------------------------- /benchmarks/communication/__init__.py: -------------------------------------------------------------------------------- 1 | '''Copyright The Microsoft DeepSpeed Team''' -------------------------------------------------------------------------------- /benchmarks/communication/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from deepspeed.accelerator import get_accelerator 7 | 8 | DEFAULT_WARMUPS = 5 9 | DEFAULT_TRIALS = 50 10 | DEFAULT_TYPE = 'float' 11 | DEFAULT_BACKEND = get_accelerator().communication_backend_name() 12 | DEFAULT_UNIT = 'Gbps' 13 | DEFAULT_DIST = 'deepspeed' 14 | DEFAULT_MAXSIZE = 24 15 | DEFAULT_DEVICE = 'cuda' 16 | TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 17 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.pyc 3 | *.png 4 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ddp_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | machine_rank: {{ machine_rank }} 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: {{ num_machines }} 8 | num_processes: {{ num_processes }} 9 | rdzv_backend: static 10 | same_network: true 11 | tpu_env: [] 12 | tpu_use_cluster: false 13 | tpu_use_sudo: false 14 | use_cpu: false 15 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ds_config.json.template: -------------------------------------------------------------------------------- 1 | { 2 | {% if fp16 %} 3 | "fp16": { 4 | "enabled": true, 5 | "initial_scale_power": 8 6 | }, 7 | {% else %} 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | {% endif %} 12 | "zero_optimization": { 13 | "stage": {{ zero_stage }}, 14 | "sub_group_size": 100000000 15 | }, 16 | "compile": { 17 | "deepcompile": {{ deepcompile }}, 18 | "offload_activation": false, 19 | "offload_opt_states": false, 20 | "double_buffer": true, 21 | "symmetric_memory": false, 22 | "free_activation": false, 23 | "debug_log": {{ debug_log }}, 24 | "sync_before_reduce": {{ sync_before_reduce }}, 25 | "sync_after_reduce": {{ sync_after_reduce }} 26 | }, 27 | 
"gradient_accumulation_steps": {{ gradient_accumulation_steps }}, 28 | "gradient_clipping": "auto", 29 | "steps_per_print": 2000, 30 | "train_batch_size": "auto", 31 | "train_micro_batch_size_per_gpu": "auto", 32 | "wall_clock_breakdown": false 33 | } -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ds_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | {%- if zero_stage == 3 %} 6 | zero3_init_flag: true 7 | {%- endif %} 8 | deepspeed_config_file: configs/ds_config.json 9 | distributed_type: DEEPSPEED 10 | machine_rank: {{ machine_rank }} 11 | main_training_function: main 12 | num_machines: {{ num_machines }} 13 | num_processes: {{ num_processes }} 14 | rdzv_backend: static 15 | same_network: true 16 | tpu_env: [] 17 | tpu_use_cluster: false 18 | tpu_use_sudo: false 19 | use_cpu: false -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/fsdp_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | fsdp_config: 5 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 6 | fsdp_backward_prefetch: BACKWARD_PRE 7 | fsdp_cpu_ram_efficient_loading: true 8 | fsdp_forward_prefetch: false 9 | fsdp_offload_params: false 10 | {%- if zero_stage == 3 %} 11 | fsdp_sharding_strategy: FULL_SHARD 12 | {%- else %} 13 | fsdp_sharding_strategy: SHARD_GRAD_OP 14 | {%- endif %} 15 | fsdp_state_dict_type: SHARDED_STATE_DICT 16 | fsdp_sync_module_states: true 17 | fsdp_use_orig_params: true 18 | machine_rank: {{ machine_rank }} 19 | main_training_function: main 20 | mixed_precision: bf16 21 | num_machines: {{ num_machines }} 22 | num_processes: {{ num_processes }} 23 | rdzv_backend: static 24 | same_network: true 25 | tpu_env: [] 26 | tpu_use_cluster: false 27 | tpu_use_sudo: false 28 | use_cpu: false 29 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/singlegpu_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: NO 4 | main_training_function: main 5 | mixed_precision: bf16 6 | use_cpu: false 7 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/hostfile_n4: -------------------------------------------------------------------------------- 1 | node-0 slots=8 2 | node-1 slots=8 3 | node-2 slots=8 4 | node-3 slots=8 5 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_bench_offload.sh: -------------------------------------------------------------------------------- 1 | PROFILE_DIR=${PROFILE_DIR:-"profile_offload"} 2 | mkdir -p ${PROFILE_DIR} 3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}" 4 | COMPILE_OPTS="--compile" 5 | DC_OPTS="--compile --deepcompile" 6 | ACC_OPTS="--gradient-accumulation-steps 1" 7 | AC_OPTS="--activation-checkpointing" 8 | 9 | mkdir -p logs 10 | 11 | export LOG_BASE="logs_offload" 12 | mkdir -p 
${LOG_BASE} 13 | 14 | MODEL="meta-llama/Meta-Llama-3-70B-Instruct" 15 | BATCH_SIZE_OPTS=(1) 16 | SEQ_LENGTH_OPTS=(1024) 17 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do 18 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do 19 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} ${ACC_OPTS} ${AC_OPTS} ${PROFILE_OPTS}" 20 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3 21 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3 --ds-offload 22 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states 23 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states_sync 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_bench_z1.sh: -------------------------------------------------------------------------------- 1 | PROFILE_DIR=${PROFILE_DIR:-profiles} 2 | mkdir -p ${PROFILE_DIR} 3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}" 4 | COMPILE_OPTS="--compile" 5 | DC_OPTS="--compile --deepcompile" 6 | ACC_OPTS="--gradient-accumulation-steps 1" 7 | AC_OPTS="--activation-checkpointing" 8 | 9 | export NUM_NODES=${NUM_NODES:-4} 10 | 11 | MODEL="meta-llama/Meta-Llama-3-8B-Instruct" 12 | BATCH_SIZE_OPTS=(1 2 4) 13 | SEQ_LENGTH_OPTS=(512 1024 2048) 14 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do 15 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do 16 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} --zero-stage 1 ${ACC_OPTS} ${AC_OPTS}" 17 | bash ./run_multinode.sh --backend deepspeed ${ARGS} 18 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${COMPILE_OPTS} 19 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${DC_OPTS} 20 | 21 | cp -r logs ${PROFILE_DIR}/ 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $* 4 | 5 | SCRIPT_DIR=$(dirname $(realpath $0)) 6 | HOST_IP=$(hostname -i) 7 | NUM_NODES=${NUM_NODES:-1} 8 | 9 | # verify that NUM_NODES is a positive integer 10 | if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then 11 | echo "Error: NUM_NODES must be a positive integer" 12 | exit 1 13 | fi 14 | 15 | # check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists 16 | if [ ! 
-f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then 17 | echo "Error: hostfile_n${NUM_NODES} does not exist" 18 | exit 1 19 | fi 20 | 21 | if [ "${NUM_NODES}" == "1" ]; then 22 | # avoid dependency on pdsh when possible 23 | cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $* 24 | else 25 | ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*" 26 | fi 27 | -------------------------------------------------------------------------------- /benchmarks/inference/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/128k-120.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 128000 2 | prompt_length_var: 0.1 3 | max_prompt_length: 131072 4 | max_new_tokens: 120 5 | max_new_tokens_var: 0.3 6 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/1300-120.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 1300 2 | prompt_length_var: 0.3 3 | max_new_tokens: 120 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/2600-60.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 2600 2 | prompt_length_var: 0.3 3 | max_new_tokens: 60 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/500-500.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 500 2 | prompt_length_var: 0.3 3 | max_new_tokens: 500 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | [project] 5 | name = "deepspeedometer" 6 | version = "0.0.1" 7 | authors = [ 8 | { name="Ammar Ahmad Awan", email="ammar.awan@microsoft.com" }, 9 | { name="Arash Bakhtiari", email="abakhtiari@microsoft.com" }, 10 | { name="Connor Holmes"}, 11 | { name="Lev Kurilenko", email="lev.kurilenko@microsoft.com" }, 12 | { name="Heyang Qin", email="heyangqin@microsoft.com" }, 13 | { name="Masahiro Tanaka", email="mtanaka@microsoft.com" }, 14 | { name="Michael Wyatt", email="michaelwyatt@microsoft.com" }, 15 | ] 16 | description = "LLM benchmarking tool" 17 | readme = "README.md" 18 | requires-python = ">=3.8" 19 | classifiers = [ 20 | "Programming Language :: Python :: 3", 21 | ] 22 | dependencies = [ 23 | "loguru", 24 | "pydantic>=2.0.0", 25 | "torch", 26 | "tqdm", 27 | "transformers", 28 | ] 29 | 30 | [project.urls] 31 | Homepage = "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference/deepspeedometer" 32 | Issues = "https://github.com/deepspeedai/DeepSpeedExamples/issues" 33 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/run_example.sh: -------------------------------------------------------------------------------- 1 | python -m
src.deepspeedometer.benchmark_runner --model "facebook/opt-125m" --api dummy --config_file ./configs/1300-120.yaml 2 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py: -------------------------------------------------------------------------------- 1 | from .arg_parsing import parse_args_to_configs 2 | from .benchmark_runner import BenchmarkRunner 3 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseClient 2 | 3 | from .azure_ml_client import AzureMLClientConfig, AzureMLClient 4 | from .dummy_client import DummyClientConfig, DummyClient 5 | from .fastgen_client import FastGenClientConfig, FastGenClient 6 | from .vllm_client import vLLMClientConfig, vLLMClient 7 | from .openai_client import openaiClientConfig, openaiClient 8 | 9 | client_config_classes = { 10 | "dummy": DummyClientConfig, 11 | "azure_ml": AzureMLClientConfig, 12 | "fastgen": FastGenClientConfig, 13 | "vllm": vLLMClientConfig, 14 | "openai": openaiClientConfig 15 | } 16 | client_classes = { 17 | "dummy": DummyClient, 18 | "azure_ml": AzureMLClient, 19 | "fastgen": FastGenClient, 20 | "vllm": vLLMClient, 21 | "openai": openaiClient, 22 | } 23 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict 3 | 4 | from ..config import BaseConfigModel 5 | from ..prompt import Prompt 6 | 7 | 8 | class BaseClient(ABC): 9 | def __init__(self, config: BaseConfigModel) -> None: 10 | self.config = config 11 | 12 | @abstractmethod 13 | def start_service(self) -> None: 14 | pass 15 | 16 | @abstractmethod 17 | def stop_service(self) -> None: 18 | pass 19 | 20 | @abstractmethod 21 | def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: 22 | pass 23 | 24 | @abstractmethod 25 | def send_request(self, request_kwargs: Dict[str, Any]) -> Any: 26 | pass 27 | 28 | @abstractmethod 29 | def process_response(self, raw_response: Any) -> str: 30 | pass 31 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ConfigDict 2 | 3 | 4 | class BaseConfigModel(BaseModel): 5 | model_config = ConfigDict( 6 | validate_default=True, 7 | validate_assignment=False, 8 | use_enum_values=True, 9 | populate_by_name=True, 10 | extra="forbid", 11 | arbitrary_types_allowed=True, 12 | protected_namespaces=(), 13 | ) 14 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | from typing import Any 3 | 4 | 5 | @dataclass 6 | class Response: 7 | prompt_text: str = "" 8 | prompt_tokens: int = 0 9 | generated_output: str = "" 10 | generated_tokens: int = 0 11 | request_time: float = 0 12 | raw_response: Any = None 13 | client_id: int = 0 14 | 15 | def to_dict(self) -> dict: 
16 | return asdict(self) 17 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/README.md: -------------------------------------------------------------------------------- 1 | To run the unit tests: 2 | 3 | `python3 -m pytest .` -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/deepspeedometer/tests/__init__.py -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner 4 | 5 | 6 | def test_benchmark_runner(benchmark_args, num_clients): 7 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 8 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 9 | benchmark_runner.run() 10 | 11 | expected_results = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( 12 | num_clients 13 | ) 14 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) 15 | assert ( 16 | expected_results == actual_results 17 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." 18 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import yaml 4 | 5 | import pydantic 6 | 7 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs 8 | 9 | 10 | def test_config(benchmark_args): 11 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 12 | 13 | 14 | @pytest.mark.parametrize("model", [""]) 15 | def test_config_required_fail(benchmark_args): 16 | with pytest.raises(pydantic.ValidationError): 17 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 18 | 19 | 20 | @pytest.mark.parametrize("num_config_files", [1]) 21 | def test_config_file(benchmark_args, config_files, num_clients): 22 | # Create a config that would generate 6 benchmark settings 23 | config = {"max_prompt_length": [500, 1300, 2600], "num_clients": [1, 2]} 24 | with open(config_files[0], "w") as f: 25 | yaml.dump(config, f) 26 | 27 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 28 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 29 | benchmark_settings = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( 30 | num_clients 31 | ) 32 | assert benchmark_settings == 6 33 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_early_stop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner 4 | 5 | 6 | @pytest.mark.parametrize("num_clients", [(1, 2, 4)], indirect=True) 7 | def test_early_stop(benchmark_args): 8 | benchmark_args += [ 9 | "--early_stop_latency", 10 | "1", 11 | "--dummy_client_latency_time", 12 | "2.0", 13 | ] 14 | 
print(benchmark_args) 15 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 16 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 17 | benchmark_runner.run() 18 | 19 | expected_results = 1 20 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) 21 | assert ( 22 | expected_results == actual_results 23 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." 24 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs 4 | 5 | 6 | @pytest.mark.parametrize("prompt_length_var, max_new_tokens_var", [(0, 0)]) 7 | def test_prompt_length(benchmark_args): 8 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 9 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 10 | num_clients, prompt_config = next(benchmark_runner._benchmark_settings()) 11 | 12 | for prompt in benchmark_runner.prompt_generator(prompt_config, num_prompts=10): 13 | prompt_length = benchmark_runner.prompt_generator.count_tokens(prompt.text) 14 | # Using pytest.approx here because often we will have off-by-one errors due to tokenization special tokens 15 | assert prompt_length == pytest.approx(benchmark_runner.config.prompt_length, 1) 16 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/A6000_benchmarks_example.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/mii/A6000_benchmarks_example.PNG -------------------------------------------------------------------------------- /benchmarks/inference/mii/plot_config.yaml: -------------------------------------------------------------------------------- 1 | label: "vLLM" 2 | color: "purple" 3 | marker: "o" 4 | linestyle: "--" 5 | polyfit_degree: 0 6 | x_max : 30 7 | y_max : 10 8 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | matplotlib 3 | deepspeed-mii>=0.2.0 4 | vllm>=0.2.7 5 | numpy 6 | tabulate 7 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_all.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) 7 | 8 | for MODEL in ${MODELS[@]}; do 9 | python ./run_benchmark.py --model ${MODEL} --stream --backend fastgen 10 | python ./run_benchmark.py --model ${MODEL} --stream --backend vllm 11 | done 12 | 13 | # Extra runs for Mixtral with non-default settings 14 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend fastgen 15 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend vllm -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_aml.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Run benchmark against AML endpoint 7 | python ./run_benchmark.py \ 8 | --model <model name> \ 9 | --deployment_name <aml deployment name> \ 10 | --aml_api_url <aml endpoint url> \ 11 | --aml_api_key <aml api key> \ 12 | --mean_prompt_length 2600 \ 13 | --mean_max_new_tokens 60 \ 14 | --num_requests 256 \ 15 | --backend aml 16 | 17 | ### Generate the plots 18 | python ./src/plot_th_lat.py 19 | 20 | echo "Find figures in ./plots/ and log outputs in ./results/" -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_example.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Run benchmark 7 | python ./run_benchmark.py \ 8 | --model meta-llama/Llama-2-7b-hf \ 9 | --tp_size 1 \ 10 | --num_replicas 1 \ 11 | --max_ragged_batch_size 768 \ 12 | --mean_prompt_length 2600 \ 13 | --mean_max_new_tokens 60 \ 14 | --stream \ 15 | --backend fastgen \ 16 | 17 | ### Generate the plots 18 | python ./src/plot_th_lat.py 19 | 20 | echo "Find figures in ./plots/ and log outputs in ./results/" -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_fp6.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | MODELS=(NousResearch/Llama-2-70b-hf) 7 | 8 | for MODEL in ${MODELS[@]}; do 9 | python ./run_benchmark.py --model ${MODEL} --num_requests 128 --stream --backend fastgen --fp6 --tp_size 1 10 | done -------------------------------------------------------------------------------- /benchmarks/inference/mii/src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/src/random_query_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import numpy as np 7 | import torch 8 | import random 9 | 10 | 11 | class RandomQueryGenerator: 12 | def __init__(self, input_text, tokenizer, seed): 13 | self.input_text = input_text 14 | self.tokenizer = tokenizer 15 | 16 | torch.manual_seed(seed) 17 | random.seed(seed) 18 | np.random.seed(seed) 19 | 20 | def get_random_request_text(self, length, variance, max_length, batch): 21 | request_text = [] 22 | tokenized_input = self.tokenizer.batch_encode_plus( 23 | [self.input_text], return_tensors="pt", padding=False 24 | ) 25 | offset = list(range(512)) 26 | random.shuffle(offset) 27 | 28 | text_ids = tokenized_input["input_ids"][0] 29 | for i in range(batch): 30 | # Sample the request prompt length from a normal distribution with mean=length and std=variance, capped at max_length 31 | req_prompt_length = min(int(np.random.normal(length, variance)), max_length) 32 | 33 | text = self.tokenizer.decode(text_ids[i : req_prompt_length + i]) 34 | request_text.append(text) 35 | return request_text 36 | -------------------------------------------------------------------------------- /benchmarks/inference/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.21.3 2 | -------------------------------------------------------------------------------- /benchmarks/inference/run_model.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | model=$1 4 | dtype=$2 5 | graphs=$3 6 | kernel=$4 7 | gpus=$5 8 | 9 | version=0 10 | log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version} 11 | mkdir -p ${log_path} 12 | 13 | params="--dtype $dtype " 14 | if [[ "$graphs" == "true" ]]; then 15 | params+="--graphs " 16 | fi 17 | if [[ "$kernel" == "true" ]]; then 18 | params+="--kernel " 19 | fi 20 | 21 | echo "baseline $log_path" 22 | deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log 23 | 24 | echo "deepspeed $log_path" 25 | deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/deepspeed.log -------------------------------------------------------------------------------- /compression/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Model Compression examples 2 | 3 | The examples in this folder demonstrate features and models that take advantage of the DeepSpeed compression library. 4 | 5 | A detailed tutorial for understanding and using the DeepSpeed model compression features is available here: https://www.deepspeed.ai/tutorials/model-compression/ 6 | -------------------------------------------------------------------------------- /compression/bert/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install -r requirements.txt`` 4 | 5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 6 | 7 | #### Key File: run_glue_no_trainer.py 8 | 9 | The Python code is adapted from [HuggingFace's PyTorch text_classification](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification). The key added feature is the implementation of knowledge distillation (KD), enabled with `--distill_method one_stage`. To train without KD, use `--distill_method zero_stage`. 
10 | 11 | #### Folders (config, huggingface_transformer, bash_script) 12 | 13 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 14 | * **huggingface_transformer:** This folder contains the implementation of knowledge distillation. It is based on [HuggingFace's transformer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py). 15 | The change is at line 383, where we output `attention_scores` instead of `attention_probs`. 16 | * **bash_script:** This folder contains many bash scripts for various kinds of compression. See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 17 | 18 | -------------------------------------------------------------------------------- /compression/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | transformers == 4.15.0 3 | datasets >= 1.8.0 4 | sentencepiece != 0.1.92 5 | scipy 6 | scikit-learn 7 | protobuf 8 | gpustat 9 | torch >= 1.3 10 | -------------------------------------------------------------------------------- /compression/cifar/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install torch torchvision`` 4 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 5 | 6 | #### Key File: train.py 7 | 8 | The Python code is adapted from https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar. The key added feature is the compression pipeline. 9 | 10 | #### Folders (config) 11 | 12 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 13 | 14 | #### bash script 15 | * **run_compress.sh:** This bash script contains jobs for training a checkpoint and then compressing this checkpoint. See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 16 | 17 | -------------------------------------------------------------------------------- /compression/cifar/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | 23 | "fp16":{ 24 | "enabled": true 25 | }, 26 | 27 | "gradient_clipping": 1.0, 28 | "prescale_gradients": true, 29 | 30 | "wall_clock_breakdown" : false 31 | } 32 | 33 | -------------------------------------------------------------------------------- /compression/gpt2/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install -r requirements.txt`` 4 | 5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 6 | 7 | 8 | #### Key File: run_clm_no_trainer.py 9 | 10 | The Python code is adapted from HuggingFace's run_clm_no_trainer.py (https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py). The key added feature is the compression pipeline. 11 | 12 | #### Folders (config) 13 | 14 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 
15 | 16 | #### bash script 17 | * **run_zero_quant.sh:** This bash script contains jobs for training a checkpoint and then compressing this checkpoint. Run the job under the gpt2 directory: 18 | 19 | ```DeepSpeedExamples/compression/gpt2$ . ./bash_script/run_zero_quant.sh``` 20 | See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 21 | 22 | -------------------------------------------------------------------------------- /compression/gpt2/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 8, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | 23 | "fp16":{ 24 | "enabled": true 25 | }, 26 | 27 | "gradient_clipping": 1.0, 28 | "prescale_gradients": true, 29 | 30 | "wall_clock_breakdown" : false 31 | } 32 | 33 | -------------------------------------------------------------------------------- /compression/gpt2/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.8.0 2 | sentencepiece != 0.1.92 3 | protobuf 4 | transformers == 4.15.0 5 | accelerate -------------------------------------------------------------------------------- /deepnvme/file_access/aio_load_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from deepspeed.ops.op_builder import AsyncIOBuilder 4 | from utils import parse_read_arguments, GIGA_UNIT 5 | 6 | def file_read(inp_f, handle, bounce_buffer): 7 | handle.sync_pread(bounce_buffer, inp_f) 8 | return bounce_buffer.cuda() 9 | 10 | 11 | def main(): 12 | args = parse_read_arguments() 13 | input_file = args.input_file 14 | file_sz = os.path.getsize(input_file) 15 | cnt = args.loop 16 | 17 | aio_handle = AsyncIOBuilder().load().aio_handle() 18 | bounce_buffer = torch.empty(file_sz, dtype=torch.uint8).pin_memory()  # pinned CPU bounce buffer for the file -> CPU -> GPU path 19 | 20 | t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) 21 | aio_t = t.timeit(cnt) 22 | aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t 23 | print(f'aio load_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') 24 | 25 | if args.validate: 26 | from py_load_cpu_tensor import file_read as py_file_read 27 | aio_tensor = file_read(input_file, aio_handle, bounce_buffer).cpu() 28 | py_tensor = py_file_read(input_file) 29 | print(f'Validation success = {aio_tensor.equal(py_tensor)}') 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /deepnvme/file_access/media/deepnvme_ops_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/file_access/media/deepnvme_ops_report.png -------------------------------------------------------------------------------- /deepnvme/file_access/py_load_cpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from utils import parse_read_arguments, GIGA_UNIT 4 | 5 | def file_read(inp_f): 6 | with open(inp_f, 'rb') as f: 7 | tensor = 
torch.frombuffer(f.read(), dtype=torch.uint8) 8 | return tensor 9 | 10 | def main(): 11 | args = parse_read_arguments() 12 | input_file = args.input_file 13 | file_sz = os.path.getsize(input_file) 14 | cnt = args.loop 15 | 16 | t = timeit.Timer(functools.partial(file_read, input_file)) 17 | py_t = t.timeit(cnt) 18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 19 | print(f'py load_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_load_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from utils import parse_read_arguments, GIGA_UNIT 4 | 5 | def file_read(inp_f): 6 | with open(inp_f, 'rb') as f: 7 | tensor = torch.frombuffer(f.read(), dtype=torch.uint8) 8 | return tensor.cuda() 9 | 10 | def main(): 11 | args = parse_read_arguments() 12 | input_file = args.input_file 13 | file_sz = os.path.getsize(input_file) 14 | cnt = args.loop 15 | 16 | t = timeit.Timer(functools.partial(file_read, input_file)) 17 | py_t = t.timeit(cnt) 18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 19 | print(f'py load_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_store_cpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | import pathlib 4 | from utils import parse_write_arguments, GIGA_UNIT 5 | 6 | def file_write(out_f, tensor): 7 | with open(out_f, 'wb') as f: 8 | f.write(tensor.numpy(force=True)) 9 | 10 | def main(): 11 | args = parse_write_arguments() 12 | cnt = args.loop 13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt') 14 | pathlib.Path(output_file).unlink(missing_ok=True) 15 | file_sz = args.mb_size*(1024**2) 16 | cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) 17 | 18 | t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor)) 19 | 20 | py_t = t.timeit(cnt) 21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 22 | print(f'py store_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 23 | pathlib.Path(output_file).unlink(missing_ok=True) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_store_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | import pathlib 4 | from utils import parse_write_arguments, GIGA_UNIT 5 | 6 | def file_write(out_f, tensor): 7 | with open(out_f, 'wb') as f: 8 | f.write(tensor.numpy(force=True)) 9 | 10 | def main(): 11 | args = parse_write_arguments() 12 | cnt = args.loop 13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt') 14 | pathlib.Path(output_file).unlink(missing_ok=True) 15 | file_sz = args.mb_size*(1024**2) 16 | gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) 17 | 18 | t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor)) 19 | 20 | py_t = t.timeit(cnt) 21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 22 | print(f'py store_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} 
secs, {py_gbs:5.2f} GB/sec') 23 | pathlib.Path(output_file).unlink(missing_ok=True) 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /deepnvme/file_access/run_load_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -ne 1 ]]; then 4 | echo "Usage: $0 <input_file>" 5 | exit 1 6 | fi 7 | 8 | input_file=$1 9 | if ! [[ -f "$input_file" ]]; then 10 | echo "Error: $input_file does not exist" 11 | exit 1 12 | fi 13 | 14 | 15 | echo "Running load tensor examples using $input_file" 16 | for f in aio_load_cpu_tensor.py aio_load_gpu_tensor.py \ 17 | gds_load_gpu_tensor.py \ 18 | py_load_cpu_tensor.py py_load_gpu_tensor.py; do 19 | cmd="python $f --input_file $input_file" 20 | sync 21 | echo $cmd 22 | eval $cmd 23 | sleep 2 24 | done 25 | 26 | 27 | -------------------------------------------------------------------------------- /deepnvme/file_access/run_store_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -ne 1 ]]; then 4 | echo "Usage: $0 <output_folder>" 5 | exit 1 6 | fi 7 | 8 | output_folder=$1 9 | if ! [[ -d "$output_folder" ]]; then 10 | echo "Error: $output_folder does not exist" 11 | exit 1 12 | fi 13 | 14 | 15 | echo "Running store tensor examples using $output_folder" 16 | for f in aio_store_cpu_tensor.py aio_store_gpu_tensor.py \ 17 | gds_store_gpu_tensor.py \ 18 | py_store_cpu_tensor.py py_store_gpu_tensor.py; do 19 | cmd="python $f --nvme_folder $output_folder" 20 | sync 21 | echo $cmd 22 | eval $cmd 23 | sleep 2 24 | done 25 | 26 | 27 | -------------------------------------------------------------------------------- /deepnvme/model_checkpoint/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/nvme_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/nvme_config.png -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/zero_inf_mem_use_gds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png -------------------------------------------------------------------------------- /inference/huggingface/automatic-speech-recognition/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Automatic Speech Recognition Examples 3 | 4 | # Setup 5 | Python dependencies: 6 | <pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | For the `test-wav2vec.py` speech model example, you may also need to install the `libsndfile1-dev` generic library: 11 |
<pre>
12 | sudo apt-get install libsndfile1-dev
13 | </pre>
14 | 15 | # Usage 16 | Examples can be run as follows: 17 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
18 | 19 | # Example Output 20 | Command: 21 |
<pre>
22 | deepspeed --num_gpus 1 test-wav2vec2.py
23 | </pre>
24 | 25 | Output: 26 |
<pre>
27 | WER: 0.03383673158855752
28 | </pre>
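
For orientation, these ASR scripts follow the same pattern as the other Hugging Face inference examples in this repo: build a `transformers` pipeline, then wrap its model with `deepspeed.init_inference`. The sketch below is illustrative only, not the repo's `test-wav2vec2.py` (which also evaluates WER over a dataset); the model name, the input file, and the kernel-injection flag are assumptions.

```python
# Minimal sketch of a DeepSpeed-accelerated ASR pipeline (illustrative;
# not the repo's test-wav2vec2.py). Model name, input file, and
# init_inference flags below are assumptions.
import os
import torch
import deepspeed
from transformers import pipeline

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

asr = pipeline('automatic-speech-recognition',
               model='facebook/wav2vec2-base-960h',
               device=local_rank)

# Wrap the underlying model with DeepSpeed's optimized inference engine.
asr.model = deepspeed.init_inference(asr.model,
                                     mp_size=world_size,
                                     dtype=torch.float,
                                     replace_with_kernel_inject=True)

print(asr('sample.wav'))  # 'sample.wav' is a hypothetical input file
```
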
29 | -------------------------------------------------------------------------------- /inference/huggingface/automatic-speech-recognition/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | soundfile 5 | jiwer 6 | datasets 7 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Fill Mask Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-roberta.py
18 | </pre>
19 | 20 | Output: 21 | <pre>
22 | [{'score': 0.40290409326553345, 'token': 3742, 'token_str': ' Internet', 'sequence': 'The invention of the Internet revolutionized the way we communicate with each other.'}, {'score': 0.20314466953277588, 'token': 7377, 'token_str': ' telephone', 'sequence': 'The invention of the telephone revolutionized the way we communicate with each other.'}, {'score': 0.17653286457061768, 'token': 2888, 'token_str': ' internet', 'sequence': 'The invention of the internet revolutionized the way we communicate with each other.'}, {'score': 0.06900821626186371, 'token': 4368, 'token_str': ' smartphone', 'sequence': 'The invention of the smartphone revolutionized the way we communicate with each other.'}, {'score': 0.03270129859447479, 'token': 3034, 'token_str': ' computer', 'sequence': 'The invention of the computer revolutionized the way we communicate with each other.'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/test-electra.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.electra.modeling_electra import ElectraLayer 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 10 | world_size = int(os.getenv('WORLD_SIZE', '4')) 11 | 12 | pipe = pipeline('fill-mask', model="google/electra-base-generator", 13 | tokenizer="google/electra-base-generator") 14 | 15 | # The injection_policy shows two things: 16 | # 1. which layer module we need to add Tensor-Parallelism 17 | # 2. the name of one or several linear layers: a) attention_output (both encoder and decoder), 18 | # and b) transformer output 19 | pipe.model = deepspeed.init_inference( 20 | pipe.model, 21 | mp_size=world_size, 22 | dtype=torch.float, 23 | injection_policy={ElectraLayer: ('output.dense')} 24 | ) 25 | pipe.device = torch.device(get_accelerator().device_name(local_rank)) 26 | output = pipe(f"HuggingFace is creating a {pipe.tokenizer.mask_token} that the community uses to solve NLP tasks.") 27 | 28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 29 | print(output) 30 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/test-roberta.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.roberta.modeling_roberta import RobertaLayer 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 10 | world_size = int(os.getenv('WORLD_SIZE', '4')) 11 | 12 | pipe = pipeline('fill-mask', model="roberta-large", device=local_rank) 13 | 14 | # The injection_policy shows two things: 15 | # 1. which layer module we need to add Tensor-Parallelism 16 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder), 17 | # and b) transformer output 18 | 19 | pipe.model = deepspeed.init_inference( 20 | pipe.model, 21 | mp_size=world_size, 22 | dtype=torch.float, 23 | injection_policy={RobertaLayer: ('output.dense')} 24 | ) 25 | 26 | pipe.device = torch.device(get_accelerator().device_name(local_rank)) 27 | output = pipe("The invention of the revolutionized the way we communicate with each other.") 28 | 29 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 30 | print(output) 31 | -------------------------------------------------------------------------------- /inference/huggingface/stable-diffusion/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Stable Diffusion Example 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | NOTE: Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`. 15 | 16 | # Example Output 17 | Command: 18 |
<pre>
19 | deepspeed --num_gpus 1 test-stable-diffusion.py
20 | </pre>
21 | 22 | Output: 23 |
<pre>
24 | ./baseline.png
25 | ./deepspeed.png
26 | </pre>
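
`test-stable-diffusion.py` itself is not reproduced in this listing, so the following is a minimal sketch of the pattern such a script follows: generate a baseline image, wrap the pipeline with `deepspeed.init_inference`, and generate again. The model id, prompt, and `init_inference` flags are assumptions, not the actual file contents.

```python
# Illustrative sketch only -- not the repo's test-stable-diffusion.py.
# Model id, prompt, and init_inference flags are assumptions.
import torch
import deepspeed
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
                                         torch_dtype=torch.half).to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"

# Baseline image with stock diffusers modules.
pipe(prompt).images[0].save("baseline.png")

# Ask DeepSpeed to swap in optimized submodules for the replaced SD parts.
# Per the NOTE above, local CUDA graphs apply only when mp_size == 1.
pipe = deepspeed.init_inference(pipe,
                                mp_size=1,
                                dtype=torch.half,
                                replace_with_kernel_inject=True)
pipe(prompt).images[0].save("deepspeed.png")
```
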
27 | -------------------------------------------------------------------------------- /inference/huggingface/stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | diffusers>=0.22.3 4 | triton==2.0.0.dev20221202 5 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.28.1 4 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Text Generation Script 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | The [`test-run-generation.py`](./test-run-generation.py) example can be run using [test-gpt.sh](./test-gpt.sh), which serves as an example of how to run the script. 12 |
<pre>
13 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
14 |     --model_type=gpt2 \
15 |     --model_name_or_path=gpt2-xl \
16 |     --sample_input single_query.txt \
17 |     --fp16 \
18 |     --ds-inference
19 | </pre>
20 | 21 | # Example Output 22 | Command: 23 |
<pre>
24 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
25 |     --model_type=gpt2 \
26 |     --model_name_or_path=gpt2-xl \
27 |     --sample_input single_query.txt \
28 |     --fp16 \
29 |     --ds-inference
30 | </pre>
31 | 32 | Output: 33 |
<pre>
34 | === GENERATED SEQUENCE 1 ===
35 | What is DeepSpeed?
36 | 
37 | DeepSpeed is a multi-dimensional data compression framework designed to achieve high compression ratio on human readable
38 | </pre>
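
`test-run-generation.py` is likewise not reproduced in this listing. With `--ds-inference`, its core step is wrapping the Hugging Face generation model in DeepSpeed's inference engine, roughly as sketched below; the model name is taken from the command above, while the remaining flags and the prompt handling are assumptions.

```python
# Sketch of the --ds-inference path (illustrative only; the actual
# test-run-generation.py supports many more model types and options).
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

# --fp16 plus --ds-inference: half precision with fused inference kernels.
model = deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.half,
                                 replace_with_kernel_inject=True)

inputs = tokenizer("What is DeepSpeed?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
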
39 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | numpy 5 | sentencepiece 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/single_query.txt: -------------------------------------------------------------------------------- 1 | What is DeepSpeed? 2 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/test-gpt.sh: -------------------------------------------------------------------------------- 1 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \ 2 | --model_type=gpt2 \ 3 | --model_name_or_path=gpt2-xl \ 4 | --sample_input single_query.txt \ 5 | --fp16 \ 6 | --ds-inference 7 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Text2Text Generation Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-t5.py
18 | </pre>
19 | 20 | Output: 21 |
<pre>
22 | [{'generated_text': 'd review: this is the best cast iron skillet. Great review! Great review! Great'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | sentencepiece 5 | google 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/test-t5.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.t5.modeling_t5 import T5Block 7 | 8 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 9 | world_size = int(os.getenv('WORLD_SIZE', '4')) 10 | 11 | pipe = pipeline("text2text-generation", model="google/t5-v1_1-small", device=local_rank) 12 | 13 | # The injection_policy shows two things: 14 | # 1. which layer module we need to add Tensor-Parallelism 15 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder), 16 | # and b) transformer output 17 | 18 | pipe.model = deepspeed.init_inference( 19 | pipe.model, 20 | mp_size=world_size, 21 | dtype=torch.float, 22 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} 23 | ) 24 | 25 | pipe.device = torch.device(f'cuda:{local_rank}') 26 | output = pipe("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy") 27 | 28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 29 | print(output) 30 | -------------------------------------------------------------------------------- /inference/huggingface/translation/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Translation Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-t5-base.py
18 | </pre>
19 | 20 | Output: 21 |
<pre>
22 | [{'translation_text': 'Le renard brun rapide saute au-dessus du chien lazy.'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/translation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | sentencepiece 5 | google 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/translation/test-t5-base.py: -------------------------------------------------------------------------------- 1 | import deepspeed 2 | import torch 3 | import os 4 | from transformers import pipeline 5 | from transformers.models.t5.modeling_t5 import T5Block 6 | 7 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 8 | world_size = int(os.getenv('WORLD_SIZE', '4')) 9 | 10 | # Init translator 11 | translator = pipeline("translation_en_to_fr", model="t5-base", tokenizer="t5-base", device=local_rank) 12 | 13 | # DeepSpeed init_inference API 14 | translator.model = deepspeed.init_inference(translator.model, 15 | mp_size=world_size, 16 | dtype=torch.float, 17 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} 18 | ) 19 | 20 | # Translate text 21 | text = "The quick brown fox jumps over the lazy dog." 22 | translation = translator(text) 23 | 24 | # Print translation 25 | print(translation) 26 | -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/images/over_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/inference/huggingface/zero_inference/images/over_v1.png -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed>=0.10.1 2 | torch 3 | transformers @ git+https://github.com/tjruwase/transformers@kvcache-offload-cpu 4 | packaging 5 | accelerate 6 | -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/run_llama2_70b_a6000.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export USE_TF=0 3 | BASE_LOG_DIR=~/experiments/zero_inference/ 4 | MODEL_NAME="Llama-2-70b-hf" 5 | FULL_MODEL_NAME="meta-llama/${MODEL_NAME}" 6 | QB=4 7 | 8 | BSZ=64 9 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 10 | mkdir -p $LOG_DIR 11 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt 12 | 13 | BSZ=96 14 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 15 | mkdir -p $LOG_DIR 16 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt 17 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt 18 | 19 | 20 | BSZ=200 21 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 22 | mkdir -p $LOG_DIR 23 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload --quant_bit ${QB} &> 
$LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv_q${QB}.txt 24 | -------------------------------------------------------------------------------- /inference/mii/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed MII Examples 2 | 3 | Install the requirements by running `pip install -r requirements.txt`. 4 | 5 | Once [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. See the scripts in [non-persistent](./non-persistent/) and [persistent](./persistent/) for examples. Details on the code implemented in these scripts can be found on our [Getting Started guide for MII](https://github.com/deepspeedai/DeepSpeed-mii#getting-started-with-mii). 6 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/README.md: -------------------------------------------------------------------------------- 1 | # Non-Persistent Pipeline Examples 2 | 3 | The `pipeline.py` script can be used to run any of the [supported 4 | models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models). Provide 5 | the HuggingFace model name, maximum generated tokens, and prompt(s). The 6 | generated responses will be printed in the terminal: 7 | 8 | ```shell 9 | $ python pipeline.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is" 10 | ``` 11 | 12 | Tensor-parallelism can be controlled using the `deepspeed` launcher and setting 13 | `--num_gpus`: 14 | 15 | ```shell 16 | $ deepspeed --num_gpus 2 pipeline.py 17 | ``` 18 | 19 | ## Model-Specific Examples 20 | 21 | For convenience, we also provide a set of scripts to quickly test the MII 22 | Pipeline with some popular text-generation models: 23 | 24 | | Model | Launch command | 25 | |-------|----------------| 26 | | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b) | `$ python llama2.py` | 27 | | [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) | `$ python falcon.py` | 28 | | [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | `$ deepspeed --num_gpus 2 mixtral.py` | -------------------------------------------------------------------------------- /inference/mii/non-persistent/falcon.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("tiiuae/falcon-7b") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/llama2.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("meta-llama/Llama-2-7b-hf") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/mixtral.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("mistralai/Mixtral-8x7B-v0.1") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | 
-------------------------------------------------------------------------------- /inference/mii/non-persistent/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument( 7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"] 8 | ) 9 | parser.add_argument("--max-new-tokens", type=int, default=128) 10 | args = parser.parse_args() 11 | 12 | pipe = mii.pipeline(args.model) 13 | responses = pipe( 14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True 15 | ) 16 | 17 | if pipe.is_rank_0: 18 | for r in responses: 19 | print(r, "\n", "-" * 80, "\n") 20 | -------------------------------------------------------------------------------- /inference/mii/persistent/README.md: -------------------------------------------------------------------------------- 1 | # Persistent Deployment Examples 2 | 3 | The `serve.py` script can be used to create an inference server for any of the 4 | [supported models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models). 5 | Provide the HuggingFace model name and tensor-parallelism (use the default 6 | values and run `$ python serve.py` for a single-GPU 7 | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) 8 | deployment): 9 | 10 | ```shell 11 | $ python serve.py --model "mistralai/Mistral-7B-v0.1" --tensor-parallel 1 12 | ``` 13 | 14 | Connect to the persistent deployment and generate text with `client.py`. Provide 15 | the HuggingFace model name, maximum generated tokens, and prompt(s) (or if you 16 | are using the default values, run `$ python client.py`): 17 | 18 | ```shell 19 | $ python client.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is" 20 | ``` 21 | 22 | Shut down the persistent deployment with `terminate.py`. 
Provide the HuggingFace 23 | model name (or if you are using the default values, run `$ python 24 | terminate.py`): 25 | 26 | ```shell 27 | $ python terminate.py --model "mistralai/Mistral-7B-v0.1" 28 | ``` -------------------------------------------------------------------------------- /inference/mii/persistent/client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument( 7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"] 8 | ) 9 | parser.add_argument("--max-new-tokens", type=int, default=128) 10 | args = parser.parse_args() 11 | 12 | client = mii.client(args.model) 13 | responses = client( 14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True 15 | ) 16 | 17 | for r in responses: 18 | print(r, "\n", "-" * 80, "\n") 19 | -------------------------------------------------------------------------------- /inference/mii/persistent/serve.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument("--tensor-parallel", type=int, default=1) 7 | args = parser.parse_args() 8 | 9 | mii.serve(args.model, tensor_parallel=args.tensor_parallel) 10 | 11 | print(f"Serving model {args.model} on {args.tensor_parallel} GPU(s).") 12 | print(f"Run `python client.py --model {args.model}` to connect.") 13 | print(f"Run `python terminate.py --model {args.model}` to terminate.") 14 | -------------------------------------------------------------------------------- /inference/mii/persistent/terminate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | args = parser.parse_args() 7 | 8 | client = mii.client(args.model) 9 | client.terminate_server() 10 | 11 | print(f"Terminated server for model {args.model}.") 12 | -------------------------------------------------------------------------------- /inference/mii/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed-mii>=0.1.3 2 | -------------------------------------------------------------------------------- /inference/sglang/README.md: -------------------------------------------------------------------------------- 1 | # SGLang + ZeRO-Inference Examples 2 | This folder contains examples of [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) integration into the [SGLang](https://github.com/sgl-project/sglang) framework. This integration enables SGLang to run inference on massive models (e.g., with hundreds of billions of parameters) on a single GPU through the NVMe/CPU offloading optimizations of ZeRO-Inference. 3 | 4 | ## Prerequisites 5 | 1. DeepSpeed version >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6) 6 | 2. SGLang: These examples require our SGLang [fork](https://github.com/tjruwase/sglang/tree/zero-inference). We plan to upstream the SGLang changes to the main branch. 7 | 8 | 9 | ## Examples 10 | The examples comprise the following: 11 | 1. 
bash scripts that benchmark SGLang throughput in [offline mode](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) with different ZeRO-Inference offloading options. Each script runs inference on a different model with a prompt of 512 tokens, output of 32 tokens, and batch size of 128. 12 | 2. DeepSpeed config files corresponding to ZeRO-Inference offloading: (i) CPU offload, (ii) NVMe offload with AIO, and (iii) NVMe offload with NVIDIA GDS. -------------------------------------------------------------------------------- /inference/sglang/ds_offload_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "cpu", 9 | "pin_memory": true, 10 | "buffer_size": "auto" 11 | } 12 | }, 13 | "train_batch_size": 1 14 | } 15 | -------------------------------------------------------------------------------- /inference/sglang/ds_offload_nvme_aio.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "nvme", 9 | "nvme_path": "/local_nvme/sglang", 10 | "pin_memory": true, 11 | "buffer_size": "auto", 12 | "buffer_count": 5 13 | } 14 | }, 15 | "aio": { 16 | "block_size": 8388608, 17 | "queue_depth": 32, 18 | "intra_op_parallelism": 8, 19 | "single_submit": false, 20 | "overlap_events": true, 21 | "use_gds": false 22 | }, 23 | "train_batch_size": 1 24 | } 25 | -------------------------------------------------------------------------------- /inference/sglang/ds_offload_nvme_gds.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "nvme", 9 | "nvme_path": "/local_nvme/sglang", 10 | "pin_memory": true, 11 | "buffer_size": "auto", 12 | "buffer_count": 3 13 | } 14 | }, 15 | "aio": { 16 | "block_size": 8388608, 17 | "queue_depth": 32, 18 | "intra_op_parallelism": 8, 19 | "single_submit": false, 20 | "overlap_events": true, 21 | "use_gds": true 22 | }, 23 | "train_batch_size": 1 24 | } 25 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_1B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Llama-3.2-1B" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config 
ds_offload_nvme_gds.json 9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | 11 | 12 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_70B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json 9 | # python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_8B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json 9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | -------------------------------------------------------------------------------- /scripts/check-license.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | from __future__ import annotations 8 | """ 9 | Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py 10 | """ 11 | 12 | import subprocess 13 | import sys 14 | 15 | 16 | def err(s: str) -> None: 17 | print(s, file=sys.stderr) 18 | 19 | 20 | COPYRIGHT = [ 21 | r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", 22 | r"^\(\/\/\|#\) DeepSpeed Team$" 23 | ] 24 | 25 | success = True 26 | failures = [] 27 | for f in sys.argv[1:]: 28 | for copyright_line in COPYRIGHT: 29 | if not success: 30 | continue 31 | res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True) 32 | if res.returncode == 1: 33 | success = False 34 | failures.append(f) 35 | elif res.returncode == 2: 36 | err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") 37 | err(res.stderr.decode("utf-8")) 38 | sys.exit(2) 39 | 40 | if not success: 41 | err(f'{failures}: Missing license at top of file') 42 | err(res.stdout.decode("utf-8")) 43 | sys.exit(1) 44 | -------------------------------------------------------------------------------- /training/BingBertGlue/glue_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 1, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 2e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": true 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": false 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertGlue/glue_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 1, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 2e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": true 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": false 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertGlue/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | -------------------------------------------------------------------------------- /training/BingBertGlue/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print( 8 | "pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 9 | "In that case, it requires TensorFlow to be installed. Please see " 10 | "https://www.tensorflow.org/install/ for installation instructions." 
11 | ) 12 | raise 13 | 14 | if len(sys.argv) != 5: 15 | # pylint: disable=line-too-long 16 | print( 17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`" 18 | ) 19 | else: 20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 21 | TF_CONFIG = sys.argv.pop() 22 | TF_CHECKPOINT = sys.argv.pop() 23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, 24 | PYTORCH_DUMP_OUTPUT) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /training/BingBertGlue/turing/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.distributed as dist 3 | 4 | logging.basicConfig( 5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 6 | datefmt='%m/%d/%Y %H:%M:%S', 7 | level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Logger(): 12 | def __init__(self, cuda=False): 13 | self.logger = logging.getLogger(__name__) 14 | self.cuda = cuda 15 | 16 | def info(self, message, *args, **kwargs): 17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda: 18 | self.logger.info(message, *args, **kwargs) 19 | 20 | def error(self, message, *args, **kwargs): 21 | self.logger.error(message, *args, **kwargs) 22 | -------------------------------------------------------------------------------- /training/BingBertGlue/turing/text.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | PAD = 0 4 | 5 | 6 | def mask(x): 7 | return x != PAD 8 | 9 | 10 | def torch_long(x): 11 | return torch.LongTensor(x) 12 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": false, 12 | "cuda_aware": false, 13 | "comm_backend_name": "mpi" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": false, 12 | "cuda_aware": true, 13 | "comm_backend_name": "mpi" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": 
false, 12 | "cuda_aware": false, 13 | "comm_backend_name": "nccl" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522 19 | } 20 | -------------------------------------------------------------------------------- /training/BingBertSquad/deepspeed_bsz24_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 24, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertSquad/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import evaluate as eval 4 | 5 | if __name__ == '__main__': 6 | expected_version = '1.1' 7 | parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' + 8 | expected_version) 9 | parser.add_argument('dataset_file', help='Dataset file') 10 | parser.add_argument('prediction_file', help='Prediction File') 11 | args = parser.parse_args() 12 | 13 | print( 14 | json.dumps( 15 | eval.evaluate(expected_version, args.dataset_file, 16 | args.prediction_file))) 17 | -------------------------------------------------------------------------------- /training/BingBertSquad/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | -------------------------------------------------------------------------------- /training/DeepSpeed-Domino/requirements.txt: -------------------------------------------------------------------------------- 1 | apex 2 | deepspeed>=0.16.6 3 | nltk 4 | pybind11 5 | transformers 6 | regex 7 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==1.13.3 2 | transformers==4.5.1 3 | fire==0.4.0 4 | pytz==2021.1 5 | loguru==0.5.3 6 | sh==1.14.2 7 | pytest==6.2.5 8 | tqdm==4.62.3 -------------------------------------------------------------------------------- 
/training/HelloDeepSpeed/run.sh: -------------------------------------------------------------------------------- 1 | python train_bert.py --checkpoint_dir ./experiment 2 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/run_ds.sh: -------------------------------------------------------------------------------- 1 | deepspeed --bind_cores_to_rank train_bert_ds.py --checkpoint_dir experiment_deepspeed $@ 2 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/HelloDeepSpeed/tests/__init__.py -------------------------------------------------------------------------------- /training/MoQ/README.md: -------------------------------------------------------------------------------- 1 | # Not maintained / deprecated 2 | 3 | > __Warning__ 4 | > This folder/feature has been deprecated. Feel free to test and submit an issue if you run into errors. 5 | 6 | -------------------------------------------------------------------------------- /training/MoQ/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /training/MoQ/run.sh: -------------------------------------------------------------------------------- 1 | OOO=output 2 | MASTER_PORT=12345 3 | GPU=0 4 | 5 | for TSK in qnli #stsb mrpc cola wnli sst2 rte qnli qqp mnli 6 | do 7 | 8 | if [ $TSK == wnli ] || [ $TSK == mrpc ] 9 | then 10 | EPOCH_NUM=5 11 | else 12 | EPOCH_NUM=3 13 | fi 14 | 15 | if [ $TSK == qqp ] || [ $TSK == mnli ] 16 | then 17 | TEST_JSON=test_long.json 18 | else 19 | TEST_JSON=test.json 20 | fi 21 | 22 | PORT=$((MASTER_PORT+GPU)) 23 | 24 | rm -rvf ./$OOO/${TSK} 25 | 26 | CUDA_VISIBLE_DEVICES=$GPU python -m torch.distributed.launch \ 27 | --master_port $PORT \ 28 | --nproc_per_node 1 run_glue.py \ 29 | --model_name_or_path bert-base-cased \ 30 | --task_name $TSK \ 31 | --do_train \ 32 | --do_eval \ 33 | --max_seq_length 128 \ 34 | --per_device_train_batch_size 32 \ 35 | --learning_rate 2e-5 \ 36 | --num_train_epochs $EPOCH_NUM \ 37 | --output_dir ./$OOO/$TSK/ \ 38 | --fp16 \ 39 | --warmup_steps 2 \ 40 | --deepspeed test.json 41 | 42 | done 43 | -------------------------------------------------------------------------------- /training/MoQ/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_print": 10, 3 | "gradient_clipping": 1.0, 4 | "fp16": { 5 | "initial_scale_power": 16, 6 | "enabled": true 7 | }, 8 | "quantize_training": { 9 | "enabled": true, 10 | "quantize_verbose": true, 11 | "quantizer_kernel": true, 12 | "quantize_algo": { 13 | "q_type": "symmetric" 14 | }, 15 | "quantize_bits": { 16 | "start_bits": 16, 17 | "target_bits": 8 18 | }, 19 | "quantize_schedule": { 20 | "quantize_period": 400, 21 | "schedule_offset": 0 22 | }, 23 | "quantize_groups": 8 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /training/autotuning/.gitignore: -------------------------------------------------------------------------------- 1 | autotuning_results* 2 | autotuning_exps* 3 | output* 4 | mnli 5 | 
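# (autotuning_results* and autotuning_exps* are the output folders created by the DeepSpeed autotuner; see README.md)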
-------------------------------------------------------------------------------- /training/autotuning/README.md: -------------------------------------------------------------------------------- 1 | # Autotuning Examples 2 | 3 | This showcases the [autotuning](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS). 4 | -------------------------------------------------------------------------------- /training/autotuning/hf/bert-base/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "max_train_batch_size": 4096, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/bert-large/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "arg_mappings": { 7 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 8 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /training/autotuning/hf/deberta/ds_config_fp16_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "fp16": { 4 | "enabled": true, 5 | "initial_scale_power": 12 6 | }, 7 | "autotuning": { 8 | "enabled": true, 9 | "overwrite": false, 10 | "fast": true, 11 | "arg_mappings": { 12 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 13 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /training/autotuning/hf/distilbert/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "max_train_batch_size": 4096, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "fp16": { 4 | "enabled": true 5 | }, 6 | "autotuning": { 7 | "enabled": true, 8 | "overwrite": false, 9 | "fast": true, 10 | "arg_mappings": { 11 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 12 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z0.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 0 5 | }, 6 | "fp16": { 7 | "enabled": true 
8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 1 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 2 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 3 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "fast": true, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z0.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 0 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 2 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 3 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | 
"local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_mpi_ethernet 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi \ 17 | ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --print_steps 40 \ 23 | --lr_schedule "LE" \ 24 | --lr_offset 0.0 \ 25 | --job_name $JOB_NAME \ 26 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json \ 27 | --data_path_prefix /data/bert \ 28 | &> ${JOB_NAME}.log" 29 | 30 | echo ${run_cmd} 31 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_mpi_infiniband 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 17 | --cf ${base_dir}/../../bert_large.json \ 18 | --max_seq_length 128 \ 19 | --output_dir $OUTPUT_DIR \ 20 | --deepspeed \ 21 | --print_steps 40 \ 22 | --lr_schedule "LE" \ 23 | --lr_offset 0.0 \ 24 | --job_name $JOB_NAME \ 25 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json \ 26 | --data_path_prefix /data/bert \ 27 | &> ${JOB_NAME}.log" 28 | 29 | echo ${run_cmd} 30 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq512_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/ds_train_bert_01adam_bsz4k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_nccl 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_DEBUG=INFO \ 17 | deepspeed \ 18 | ${base_dir}/../../deepspeed_train.py \ 19 | --cf ${base_dir}/../../bert_large.json \ 20 | --max_seq_length 128 \ 21 | --output_dir $OUTPUT_DIR \ 22 | --deepspeed \ 23 | --print_steps 40 \ 24 | --lr_schedule "LE" \ 25 | --lr_offset 0.0 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | &> ${JOB_NAME}.log" 30 | 31 | echo ${run_cmd} 32 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_ethernet/deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": false, 14 | "comm_backend_name": "mpi" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_infiniband/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": true, 14 | "comm_backend_name": "mpi" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_infiniband/ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you are able to install pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs), 5 | # we highly recommend you to use the NCCL-based 1-bit Adam 6 | # which has better performance and ease of use 7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_adam/nccl 8 | # and read the tutorial for more details: 9 | # https://www.deepspeed.ai/tutorials/onebit-adam/) 10 | 11 | base_dir=`pwd` 12 | 13 | # Where should we save checkpoints and tensorboard events? 
14 | JOB_NAME=onebit_adam_4k_seq128_mpi_infiniband 15 | OUTPUT_DIR=${base_dir}/bert_model_outputs 16 | 17 | mkdir -p $OUTPUT_DIR 18 | 19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 20 | --cf ${base_dir}/../../bert_large.json \ 21 | --max_seq_length 128 \ 22 | --output_dir $OUTPUT_DIR \ 23 | --deepspeed_mpi \ 24 | --deepspeed \ 25 | --deepspeed_transformer_kernel \ 26 | --print_steps 40 \ 27 | --lr_schedule "LE" \ 28 | --lr_offset 0.0 \ 29 | --job_name $JOB_NAME \ 30 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json \ 31 | --data_path_prefix /data/bert \ 32 | &> ${JOB_NAME}.log 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/nccl/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": false, 14 | "comm_backend_name": "nccl" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/nccl/ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/onebit-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 11 | JOB_NAME=onebit_adam_4k_seq128_nccl 12 | OUTPUT_DIR=${base_dir}/bert_model_outputs 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
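# NCCL_SOCKET_IFNAME pins NCCL's socket traffic to a specific NIC; replace eth0 if your nodes use a different interface name.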
17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --print_steps 40 \ 24 | --lr_schedule "LE" \ 25 | --lr_offset 0.0 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | &> ${JOB_NAME}.log 30 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": false, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": false, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": true, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- 
/training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": true, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_infiniband/ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you are able to install pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs), 5 | # we highly recommend you to use the NCCL-based 1-bit Lamb 6 | # which has better performance and ease of use 7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_lamb/nccl 8 | # and read the tutorial for more details: 9 | # https://www.deepspeed.ai/tutorials/onebit-lamb/) 10 | 11 | base_dir=`pwd` 12 | 13 | # Where should we save checkpoints and tensorboard events? 14 | JOB_NAME=onebit_lamb_64k_seq128_mpi_infiniband 15 | OUTPUT_DIR=${base_dir}/bert_model_outputs 16 | 17 | mkdir -p $OUTPUT_DIR 18 | 19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 20 | --cf ${base_dir}/../../bert_large_lamb.json \ 21 | --max_seq_length 128 \ 22 | --output_dir $OUTPUT_DIR \ 23 | --deepspeed_mpi \ 24 | --deepspeed \ 25 | --deepspeed_transformer_kernel \ 26 | --print_steps 40 \ 27 | --lr_schedule "EE" \ 28 | --lr_offset 10e-4 \ 29 | --job_name $JOB_NAME \ 30 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json \ 31 | --data_path_prefix /data/bert \ 32 | &> ${JOB_NAME}.log 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": false, 16 | "comm_backend_name": "nccl", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": false, 16 | "comm_backend_name": "nccl", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/onebit-lamb 7 | 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 11 | JOB_NAME=onebit_lamb_64k_seq128_nccl 12 | OUTPUT_DIR=${base_dir}/bert_model_outputs 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large_lamb.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --print_steps 40 \ 24 | --lr_schedule "EE" \ 25 | --lr_offset 10e-4 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | --ckpt_to_save 150 \ 30 | &> ${JOB_NAME}.log 31 | -------------------------------------------------------------------------------- /training/bing_bert/bert_dataset_provider.py: -------------------------------------------------------------------------------- 1 | class BertDatasetProviderInterface: 2 | def get_shard(self, index, shuffle=True): 3 | raise NotImplementedError 4 | 5 | def release_shard(self, index): 6 | raise NotImplementedError 7 | 8 | def prefetch_shard(self, index): 9 | raise NotImplementedError 10 | 11 | def get_batch(self, batch_iter): 12 | raise NotImplementedError 13 | 14 | def prefetch_batch(self): 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /training/bing_bert/data_worker.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import queue 3 | import time 4 | 5 | 6 | class AsyncWorker(threading.Thread): 7 | def __init__(self, dataloaders, dataset_picker): 8 | threading.Thread.__init__(self) 9 | self.req_queue = queue.Queue() 10 | self.ret_queue = queue.Queue() 11 | self.dataloaders = dataloaders 12 | self.dataset_picker = dataset_picker 13 | self.prefetch_idx = 3 14 | for i in range(self.prefetch_idx): 15 | self.req_queue.put(dataset_picker[i]) 16 | 17 | def run(self): 18 | while True: 19 | dataset_type = self.req_queue.get(block=True) 20 | if dataset_type is None: 21 | break 22 | batch = next(self.dataloaders[dataset_type]) 23 | self.req_queue.task_done() 24 | 
self.ret_queue.put(batch) 25 | 26 | def get(self): 27 | batch = self.ret_queue.get() 28 | self.ret_queue.task_done() 29 | return batch 30 | 31 | def prefetch(self): 32 | if self.prefetch_idx < len(self.dataset_picker): 33 | self.req_queue.put(self.dataset_picker[self.prefetch_idx]) 34 | self.prefetch_idx += 1 35 | 36 | def stop(self): 37 | self.req_queue.put(None) 38 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz32k_lamb_config_seq512.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "Lamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | 18 | "wall_clock_breakdown": false, 19 | 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": true, 6 | "gradient_predivide_factor": 8, 7 | "optimizer": { 8 | "type": "Adam", 9 | "params": { 10 | "lr": 1e-3, 11 | "weight_decay": 0.01, 12 | "bias_correction": false 13 | } 14 | }, 15 | "gradient_clipping": 1.0, 16 | "wall_clock_breakdown": false, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0 20 | }, 21 | "progressive_layer_drop": { 22 | "enabled": true, 23 | "theta": 0.5, 24 | "gamma": 0.001 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "Lamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | 18 | "wall_clock_breakdown": false, 19 | 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0 23 | }, 24 | "sparse_attention": { 25 | "mode": "fixed", 26 | "block": 16, 27 | "different_layout_per_head": true, 28 | "num_local_blocks": 4, 29 | "num_global_blocks": 1, 30 | "attention": "bidirectional", 31 | "horizontal_global_attention": false, 32 | "num_different_global_patterns": 4 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /training/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script runs deepspeed using sparse attention for BertEncoderLayer. 4 | 5 | base_dir=`pwd` 6 | 7 | # Where should we save checkpoints and tensorboard events? 
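# The sparse_attention block consumed by this run is defined in deepspeed_bsz64k_lamb_config_seq128.json (passed via --deepspeed_config below).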
8 | JOB_NAME=lamb_64k_seq128 9 | OUTPUT_DIR=${base_dir}/bert_model_outputs 10 | 11 | mkdir -p $OUTPUT_DIR 12 | 13 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 14 | --cf ${base_dir}/bert_large_lamb.json \ 15 | --max_seq_length 128 \ 16 | --output_dir $OUTPUT_DIR \ 17 | --deepspeed \ 18 | --deepspeed_sparse_attention \ 19 | --print_steps 100 \ 20 | --lr_schedule "EE" \ 21 | --lr_offset 10e-4 \ 22 | --job_name $JOB_NAME \ 23 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 24 | --data_path_prefix /data/bert \ 25 | &> ${JOB_NAME}.log 26 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_bsz32k_seq512.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_32k_chkpt150_seq512 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | # Assumes job name in previous seq128 run, will resume training from epoch 150 10 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_64k_seq128 11 | CHECKPOINT_EPOCH150_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch150_*` 12 | echo "checkpoint id: $CHECKPOINT_EPOCH150_NAME" 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | deepspeed ${base_dir}/deepspeed_train.py \ 17 | --cf ${base_dir}/bert_large_lamb.json \ 18 | --max_seq_length 512 \ 19 | --output_dir $OUTPUT_DIR \ 20 | --print_steps 100 \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --job_name $JOB_NAME \ 24 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \ 25 | --data_path_prefix /data/bert \ 26 | --validation_data_path_prefix /data/bert \ 27 | --rewarmup \ 28 | --lr_schedule "EE" \ 29 | --attention_dropout_checkpoint \ 30 | --lr_offset 0.0 \ 31 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \ 32 | --load_checkpoint_id ${CHECKPOINT_EPOCH150_NAME} \ 33 | &> ${JOB_NAME}.log 34 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_64k_seq128 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 12 | --cf ${base_dir}/bert_large_lamb.json \ 13 | --max_seq_length 128 \ 14 | --output_dir $OUTPUT_DIR \ 15 | --deepspeed \ 16 | --deepspeed_transformer_kernel \ 17 | --print_steps 100 \ 18 | --lr_schedule "EE" \ 19 | --lr_offset 10e-4 \ 20 | --job_name $JOB_NAME \ 21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 22 | --data_path_prefix /data/bert \ 23 | &> ${JOB_NAME}.log 24 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_nvidia_data_bsz32k_seq512.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z $1 ]]; then 4 | LOAD_EPOCH=16 5 | else 6 | LOAD_EPOCH=$1 7 | fi 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 
11 | JOB_NAME=lamb_nvidia_data_32k_chkpt${LOAD_EPOCH}_seq512 12 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs 13 | 14 | # Assumes the job name of the previous seq128 run; will resume training from epoch 16 by default 15 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_nvidia_data_64k_seq128 16 | CHECKPOINT_EPOCH_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch${LOAD_EPOCH}_*` 17 | echo "checkpoint id: $CHECKPOINT_EPOCH_NAME" 18 | 19 | mkdir -p $OUTPUT_DIR 20 | 21 | deepspeed ${base_dir}/deepspeed_train.py \ 22 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \ 23 | --max_seq_length 512 \ 24 | --output_dir $OUTPUT_DIR \ 25 | --print_steps 1 \ 26 | --deepspeed \ 27 | --deepspeed_transformer_kernel \ 28 | --job_name $JOB_NAME \ 29 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \ 30 | --data_path_prefix /workspace/bert \ 31 | --use_nvidia_dataset \ 32 | --rewarmup \ 33 | --lr_schedule "EE" \ 34 | --attention_dropout_checkpoint \ 35 | --lr_offset 0.0 \ 36 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \ 37 | --load_checkpoint_id ${CHECKPOINT_EPOCH_NAME} \ 38 | &> ${JOB_NAME}.log 39 | 
-------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_nvidia_data_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_nvidia_data_64k_seq128 7 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 12 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \ 13 | --max_seq_length 128 \ 14 | --output_dir $OUTPUT_DIR \ 15 | --deepspeed \ 16 | --deepspeed_transformer_kernel \ 17 | --print_steps 100 \ 18 | --lr_schedule "EE" \ 19 | --lr_offset 10e-4 \ 20 | --job_name $JOB_NAME \ 21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 22 | --data_path_prefix /workspace/bert \ 23 | --use_nvidia_dataset \ 24 | &> ${JOB_NAME}.log 25 | 
-------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 
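# Progressive layer drop (the theta/gamma schedule) is configured in deepspeed_bsz4k_progressive_layer_drop_config_seq128.json, passed below.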
6 | JOB_NAME=adam_4k_seq128_progressive_layer_drop 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | config="--progressive_layer_drop" 12 | 13 | NCCL_TREE_THRESHOLD=0 deepspeed \ 14 | ${base_dir}/deepspeed_train.py \ 15 | --cf ${base_dir}/bert_base_large_lr.json \ 16 | --max_seq_length 128 \ 17 | --output_dir $OUTPUT_DIR \ 18 | --deepspeed \ 19 | --print_steps 100 \ 20 | --lr_schedule "LE" \ 21 | --job_name $JOB_NAME \ 22 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json \ 23 | --data_path_prefix /data/bert \ 24 | ${config} \ 25 | &> ${JOB_NAME}.log 26 | 
-------------------------------------------------------------------------------- /training/bing_bert/glue_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | 
-------------------------------------------------------------------------------- /training/bing_bert/glue_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | 
-------------------------------------------------------------------------------- /training/bing_bert/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | 
-------------------------------------------------------------------------------- /training/bing_bert/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print( 8 | "pytorch_pretrained_bert can only be used from the command line to convert TensorFlow checkpoints to PyTorch models. " 9 | "In that case, it requires TensorFlow to be installed. Please see " 10 | "https://www.tensorflow.org/install/ for installation instructions." 
11 | ) 12 | raise 13 | 14 | if len(sys.argv) != 5: 15 | # pylint: disable=line-too-long 16 | print( 17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`" 18 | ) 19 | else: 20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 21 | TF_CONFIG = sys.argv.pop() 22 | TF_CHECKPOINT = sys.argv.pop() 23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, 24 | PYTORCH_DUMP_OUTPUT) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | 
-------------------------------------------------------------------------------- /training/bing_bert/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | 
-------------------------------------------------------------------------------- /training/bing_bert/turing/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.distributed as dist 3 | 4 | logging.basicConfig( 5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 6 | datefmt='%m/%d/%Y %H:%M:%S', 7 | level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Logger(): 12 | def __init__(self, cuda=False): 13 | self.logger = logging.getLogger(__name__) 14 | self.cuda = cuda 15 | 16 | def info(self, message, *args, **kwargs): 17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda: 18 | self.logger.info(message, *args, **kwargs) 19 | 20 | def error(self, message, *args, **kwargs): 21 | self.logger.error(message, *args, **kwargs) 22 | 
-------------------------------------------------------------------------------- /training/bing_bert/turing/text.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | PAD = 0 4 | 5 | 6 | def mask(x): 7 | return x != PAD 8 | 9 | 10 | def torch_long(x): 11 | return torch.LongTensor(x) 12 | 
-------------------------------------------------------------------------------- /training/cifar/README.md: -------------------------------------------------------------------------------- 1 | Thanks to Gopi Kumar for contributing this example, which demonstrates how to apply DeepSpeed to the CIFAR-10 model. 2 | 3 | `cifar10_tutorial.py` 4 | Baseline CIFAR-10 model. 5 | 6 | `cifar10_deepspeed.py` 7 | CIFAR-10 model with DeepSpeed applied. 8 | 9 | `run_ds.sh` 10 | Script for running the DeepSpeed-applied model. 11 | 12 | `run_ds_moe.sh` 13 | Script for running the DeepSpeed model with Mixture of Experts (MoE) integration. 14 | 15 | `run_ds_prmoe.sh` 16 | Script for running the DeepSpeed model with Pyramid Residual MoE (PR-MoE) integration. 
17 | 18 | * To run baseline CIFAR-10 model - `python cifar10_tutorial.py` 19 | * To run DeepSpeed CIFAR-10 model - `bash run_ds.sh` 20 | * To run DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - `bash run_ds_moe.sh` 21 | * To run DeepSpeed CIFAR-10 model with Pyramid Residual MoE (PR-MoE) - `bash run_ds_prmoe.sh` 22 | * To run with different data type (default=`fp16`) and zero stages (default=`0`) - `bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}` 23 | -------------------------------------------------------------------------------- /training/cifar/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision==0.4.0 2 | pillow>=7.1.0 3 | matplotlib 4 | -------------------------------------------------------------------------------- /training/cifar/run_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed --bind_cores_to_rank cifar10_deepspeed.py --deepspeed $@ 4 | -------------------------------------------------------------------------------- /training/cifar/run_ds_moe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Number of nodes 4 | NUM_NODES=1 5 | # Number of GPUs per node 6 | NUM_GPUS=2 7 | # Size of expert parallel world (should be less than total world size) 8 | EP_SIZE=2 9 | # Number of total experts 10 | EXPERTS=2 11 | 12 | deepspeed --num_nodes=${NUM_NODES}\ 13 | --num_gpus=${NUM_GPUS} \ 14 | --bind_cores_to_rank \ 15 | cifar10_deepspeed.py \ 16 | --log-interval 100 \ 17 | --deepspeed \ 18 | --moe \ 19 | --ep-world-size ${EP_SIZE} \ 20 | --num-experts ${EXPERTS} \ 21 | --top-k 1 \ 22 | --noisy-gate-policy 'RSample' \ 23 | --moe-param-group 24 | -------------------------------------------------------------------------------- /training/cifar/run_ds_prmoe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Number of nodes 4 | NUM_NODES=1 5 | # Number of GPUs per node 6 | NUM_GPUS=2 7 | # Size of expert parallel world (should be less than total world size) 8 | EP_SIZE=2 9 | # Number of total experts, note here we need to pass >= two numbers (numbers can be different) 10 | EXPERTS='2 4' 11 | 12 | deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \ 13 | --log-interval 100 \ 14 | --deepspeed \ 15 | --moe \ 16 | --ep-world-size ${EP_SIZE} \ 17 | --num-experts ${EXPERTS} \ 18 | --top-k 1 \ 19 | --mlp-type 'residual' \ 20 | --noisy-gate-policy 'RSample' \ 21 | --moe-param-group 22 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/bash_script/run_medium_random_ltd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ##################apply random-ltd to fine-tune ptb on GPT-medium (24-layer)############################## 3 | ####see more on random-ltd: https://arxiv.org/abs/2211.11586 4 | export CUDA_VISIBLE_DEVICES=2 5 | mkdir -p ./output/check_medium 6 | python -m torch.distributed.launch --nproc_per_node=1 \ 7 | --master_port 12345 \ 8 | run_clm_no_trainer.py \ 9 | --random_ltd \ 10 | --dataset_name ptb_text_only \ 11 | --dataset_config_name penn_treebank \ 12 | --model_name_or_path gpt2-medium \ 13 | --per_device_train_batch_size 2 \ 14 | --per_device_eval_batch_size 2 \ 15 | --num_train_epochs 2 \ 16 | --deepspeed_config config/ds_config_gpt_medium_random_ltd.json \ 17 | 
--deepspeed --seed 1234 --num_warmup_steps 100 \ 18 | --output_dir ./output/check_medium &> ./output/check_medium/training.log -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/config/ds_config_gpt_base_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 4, 3 | "train_micro_batch_size_per_gpu": 2, 4 | "steps_per_print": 2, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | "enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": "attention_mask", 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 128, 37 | "max_value": 1024, 38 | "schedule_type": "fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 400, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same as what you used during map job. We need these 2 4 | # configs to know how many map job result files do we have. 5 | num_workers=1 6 | num_threads=1 7 | # Reduce job only has 1 worker but can accelerate by multithreading. 
8 | num_threads_reduce=1 9 | 10 | save_path="/blob/users/conglli/data/analysis_ptb_gpt/" 11 | 12 | metric='total_vocab_freq' 13 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 14 | 15 | dataset_name="ptb_text_only" 16 | dataset_config_name="penn_treebank" 17 | model_name_or_path="gpt2-medium" 18 | 19 | batch_size=1000 20 | 21 | jobname="gpt-ptb-analyzing-${metric}-reduce" 22 | 23 | options=" \ 24 | --analyzing_task reduce \ 25 | --analyzing_metric ${metric} \ 26 | --analyzing_num_workers ${num_workers} \ 27 | --analyzing_num_threads ${num_threads} \ 28 | --analyzing_num_threads_reduce ${num_threads_reduce} \ 29 | --dataset_name ${dataset_name} \ 30 | --dataset_config_name ${dataset_config_name} \ 31 | --model_name_or_path ${model_name_or_path} \ 32 | --per_device_train_batch_size ${batch_size} \ 33 | --output_dir ${save_path}" 34 | 35 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/finetune/ds_config_gpt2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : GB_SIZE, 3 | "train_micro_batch_size_per_gpu": MB_SIZE, 4 | "steps_per_print": 10, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "fp16":{ 9 | "enabled": false 10 | }, 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": true, 13 | "wall_clock_breakdown" : false, 14 | "data_efficiency": { 15 | "enabled": true, 16 | "data_routing": { 17 | "enabled": LTD_ENABLED, 18 | "random_ltd":{ 19 | "enabled": LTD_ENABLED, 20 | "total_layer_num": 12, 21 | "random_ltd_layer_num": 10, 22 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 23 | "model_mask_name": "attention_mask", 24 | "model_type": "decoder", 25 | "hidden_state_order": "batch_seq_dim", 26 | "random_ltd_schedule": { 27 | "min_value": LTD_MIN, 28 | "max_value": 1024, 29 | "schedule_type": "fixed_linear", 30 | "schedule_config": { 31 | "require_steps": LTD_STEP, 32 | "seq_per_step": 8 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/requirement.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.8.0 2 | sentencepiece != 0.1.92 3 | protobuf 4 | transformers == 4.15.0 5 | accelerate -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/bash_script/run_cifar_random_ltd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | mkdir -p out/cifar/ 5 | # deepspeed --include worker-0:0 --master_port 60000 main_cifar.py \ 6 | # --deepspeed_config config/ds_config.json \ 7 | # --deepspeed --random_ltd \ 8 | # --dataset cifar10vit224 \ 9 | # --seed 1234 \ 10 | # --printfreq 400 \ 11 | # --arch lvits16r224 \ 12 | # --optimizer sgd \ 13 | # --lr 0.0001 --seq_len 197 \ 14 | # --scheduler constant \ 15 | # --epochs 14 \ 16 | # --batchsize 32 \ 17 | # --data_outdir check/cifar/ | tee -a check/cifar/training.log 18 | 19 | deepspeed --num_nodes 1 --num_gpus 1 --master_port 60000 main_cifar.py \ 20 | --deepspeed_config config/ds_config_cifar_random_ltd.json \ 21 | --deepspeed --random_ltd \ 22 | --dataset cifar10vit224 \ 23 | --seed 1234 \ 24 | --printfreq 400 \ 25 | --arch vits16r224 \ 26 | --optimizer sgd \ 27 | --lr 0.0001 --seq_len 197 \ 28 | --scheduler constant \ 29 | --epochs 14 \ 30 | --batchsize 128 \ 31 | --data_outdir out/cifar/ | tee -a out/cifar/training1.log -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/config/ds_config_cifar_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 200, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | "enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": null, 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 32, 37 | "max_value": 197, 38 | "schedule_type":"fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 3910, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/config/ds_config_imagenet_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 200, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | 
"enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": null, 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 32, 37 | "max_value": 197, 38 | "schedule_type":"fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 3910, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from .vit import * 17 | from .vit import Block 18 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/requirement.txt: -------------------------------------------------------------------------------- 1 | timm==0.6.5 2 | torch>1.10.0 3 | torchvision>0.11.1 4 | mpi4py 5 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .utils import get_model, get_optimizer, get_scheduler, LossTracker, AverageMeter, ProgressMeter, accuracy, run_cmd 16 | from .get_data import get_dataset 17 | 18 | __all__ = ["get_dataset", "ImageMemFolder", "AverageMeter", "ProgressMeter", "accuracy", "get_optimizer", "get_scheduler", "get_model", "LossTracker", "run_cmd"] 19 | -------------------------------------------------------------------------------- /training/gan/gan_baseline_run.sh: -------------------------------------------------------------------------------- 1 | python gan_baseline_train.py --dataset celeba --cuda --tensorboard_path './runs/baseline' 2 | -------------------------------------------------------------------------------- /training/gan/gan_deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 64, 3 | "optimizer": { 4 | "type": "Adam", 5 | "params": { 6 | "lr": 0.0002, 7 | "betas": [ 8 | 0.5, 9 | 0.999 10 | ], 11 | "eps": 1e-8 12 | } 13 | }, 14 | "steps_per_print" : 10 15 | } 16 | -------------------------------------------------------------------------------- /training/gan/gan_deepspeed_run.sh: -------------------------------------------------------------------------------- 1 | deepspeed gan_deepspeed_train.py --dataset celeba --cuda --deepspeed_config gan_deepspeed_config.json --tensorboard_path './runs/deepspeed' 2 | -------------------------------------------------------------------------------- /training/imagenet/assets/resnetplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/imagenet/assets/resnetplot.png -------------------------------------------------------------------------------- /training/imagenet/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": false 25 | }, 26 | "gradient_clipping": 0, 27 | "prescale_gradients": false, 28 | "cuda_visible_devices": 0, 29 | "wall_clock_breakdown" : false 30 | } 31 | -------------------------------------------------------------------------------- /training/imagenet/config/ds_fp16_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": true, 25 | "auto_cast": true 26 | }, 27 | "gradient_clipping": 0, 28 | "prescale_gradients": false, 29 | "cuda_visible_devices": 0, 30 | "wall_clock_breakdown" : false 31 | } 32 | -------------------------------------------------------------------------------- /training/imagenet/config/ds_fp16_z1_config.json: -------------------------------------------------------------------------------- 1 | { 2 
| "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 1 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": true, 25 | "auto_cast": true 26 | }, 27 | "gradient_clipping": 0, 28 | "prescale_gradients": false, 29 | "cuda_visible_devices": 0, 30 | "wall_clock_breakdown" : false 31 | } 32 | -------------------------------------------------------------------------------- /training/imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /training/imagenet/run_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/imagenet/run_ds_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/imagenet/run_ds_fp16_z1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_z1_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/megatron/README.md: -------------------------------------------------------------------------------- 1 | # Not maintained / deprecated 2 | 3 | > __Warning__ 4 | > all future/current changes are now in new [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). 
5 | -------------------------------------------------------------------------------- /training/offload_states/output_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pytablewriter import MarkdownTableWriter 3 | 4 | 5 | def read_csv(file_path): 6 | return pd.read_csv(file_path) 7 | 8 | df = read_csv('offload_states.log') 9 | df.columns = ['pin_memory', 'non_blocking', 'offload_time', 'load_time'] 10 | 11 | df['ratio_string'] = df['offload_time'].round(2).astype(str) + " / " + df['load_time'].round(2).astype(str) 12 | 13 | result_df = pd.DataFrame({ 14 | 'pin_memory=0_non_blocking=0': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True), 15 | 'pin_memory=0_non_blocking=1': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True), 16 | 'pin_memory=1_non_blocking=0': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True), 17 | 'pin_memory=1_non_blocking=1': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True) 18 | }) 19 | result_df = result_df.dropna() 20 | result_df.index = range(1, len(result_df) + 1) 21 | result_df.index.name = 'trial' 22 | # print(result_df) 23 | 24 | writer = MarkdownTableWriter() 25 | writer.from_dataframe(result_df, 26 | add_index_column=True, 27 | ) 28 | writer.write_table() -------------------------------------------------------------------------------- /training/offload_states/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | NGPUS=4 2 | HIDDEN_SIZE=32768 3 | NUM_LAYERS=4 4 | 5 | TRIALS=10 6 | 7 | PIN_MEMORY_OPTS=(0 1) 8 | NON_BLOCKING_OPTS=(0 1) 9 | 10 | for i in $(seq 1 $TRIALS); do 11 | for PIN_MEMORY in "${PIN_MEMORY_OPTS[@]}"; do 12 | PIN_MEMORY_ARG="" 13 | if [ $PIN_MEMORY -eq 1 ]; then 14 | PIN_MEMORY_ARG="--pin_memory" 15 | fi 16 | 17 | for NON_BLOCKING in "${NON_BLOCKING_OPTS[@]}"; do 18 | NON_BLOCKING_ARG="" 19 | if [ $NON_BLOCKING -eq 1 ]; then 20 | NON_BLOCKING_ARG="--non_blocking" 21 | fi 22 | 23 | echo "Running iteration $i" 24 | deepspeed --num_gpus=$NGPUS offload_states.py --hidden_dim $HIDDEN_SIZE --nlayers $NUM_LAYERS $PIN_MEMORY_ARG $NON_BLOCKING_ARG 25 | done 26 | done 27 | done 28 | python output_table.py 29 | -------------------------------------------------------------------------------- /training/pipeline_parallelism/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu" : 8, 4 | 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.001, 9 | "betas": [ 10 | 0.9, 11 | 0.999 12 | ], 13 | "eps": 1e-8 14 | } 15 | }, 16 | 17 | "steps_per_print" : 10, 18 | "wall_clock_breakdown" : false 19 | } 20 | -------------------------------------------------------------------------------- /training/pipeline_parallelism/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed train.py --deepspeed_config=ds_config.json -p 2 --steps=200 4 | -------------------------------------------------------------------------------- /training/stable_diffusion/mytrainbash.sh: -------------------------------------------------------------------------------- 1 | export MODEL_NAME="stabilityai/stable-diffusion-2-1-base" 2 | export OUTPUT_DIR="./sd-distill-v21" 3 | 4 | if [ ! 
-d "$OUTPUT_DIR" ]; then 5 | mkdir "$OUTPUT_DIR" 6 | echo "Folder '$OUTPUT_DIR' created" 7 | else 8 | echo "Folder '$OUTPUT_DIR' already exists" 9 | fi 10 | 11 | 12 | accelerate launch train_sd_distil_lora.py \ 13 | --pretrained_model_name_or_path=$MODEL_NAME \ 14 | --output_dir=$OUTPUT_DIR \ 15 | --default_prompt="A man dancing" \ 16 | --resolution=512 \ 17 | --train_batch_size=1 \ 18 | --gradient_accumulation_steps=1 \ 19 | --learning_rate=5e-6 \ 20 | --lr_scheduler="constant" \ 21 | --lr_warmup_steps=0 22 | -------------------------------------------------------------------------------- /training/stable_diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.16.0 2 | torchvision 3 | transformers>=4.25.1 4 | ftfy 5 | tensorboard 6 | Jinja2 7 | -------------------------------------------------------------------------------- /training/tensor_parallel/README.md: -------------------------------------------------------------------------------- 1 | # tensor parallel example 2 | This project is adapted from https://github.com/tatsu-lab/stanford_alpaca. 3 | We only modified the ds_config to enable tensor parallelism and more detailed logging, as an example use case. 4 | 5 | **Script** 6 | 7 | ``` bash run.sh ``` or ```bash run.sh MODE``` 8 | 9 | 10 | -------------------------------------------------------------------------------- /training/tensor_parallel/configs/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 1, 25 | "gather_16bit_weights_on_model_save": true 26 | }, 27 | "tensor_parallel":{ 28 | "autotp_size": 4 29 | }, 30 | "gradient_accumulation_steps": "auto", 31 | "gradient_clipping": "auto", 32 | "steps_per_print": 1, 33 | "train_batch_size": "auto", 34 | "train_micro_batch_size_per_gpu": "auto", 35 | "wall_clock_breakdown": false 36 | } -------------------------------------------------------------------------------- /training/tensor_parallel/configs/ds_config_temp.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": ${zero_stage}, 25 | "gather_16bit_weights_on_model_save": true 26 | }, 27 | "tensor_parallel":{ 28 | "autotp_size": ${autotp_size} 29 | }, 30 | "gradient_accumulation_steps": "auto", 31 | "gradient_clipping": "auto", 32 | "steps_per_print": 1, 33 | "train_batch_size": "auto", 34 | "train_micro_batch_size_per_gpu": "auto", 35 | "wall_clock_breakdown": false 36 | } -------------------------------------------------------------------------------- /training/tensor_parallel/requirements.txt: 
-------------------------------------------------------------------------------- 1 | transformers==4.50.1 2 | deepspeed>=0.16.4 3 | accelerate==1.6.0 4 | numpy 5 | rouge_score 6 | fire 7 | openai==0.28.0 8 | torch 9 | sentencepiece 10 | tokenizers>=0.13.3 11 | --------------------------------------------------------------------------------
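Usage note: `configs/ds_config_temp.json` above is a shell-style template whose `${zero_stage}` and `${autotp_size}` placeholders must be filled in before DeepSpeed can parse it. A minimal sketch of one way to render it, assuming GNU gettext's `envsubst` is installed; the repository's `run.sh` may do this differently, and the values and output filename below are hypothetical:

```bash
#!/bin/bash
# Hypothetical settings: ZeRO stage 1 with 4-way automatic tensor parallelism.
export zero_stage=1
export autotp_size=4
# Substitute only these two variables (other "$" occurrences are left intact)
# and write a concrete config file next to the template.
envsubst '${zero_stage} ${autotp_size}' \
  < configs/ds_config_temp.json > configs/ds_config_rendered.json
```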