├── .github
└── workflows
│ └── formatting.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── applications
├── DeepSpeed-Chat
│ ├── .gitignore
│ ├── README.md
│ ├── assets
│ │ ├── image
│ │ │ ├── 1.3B-breakdown.png
│ │ │ ├── Banner-benchmark.png
│ │ │ ├── RLHF.png
│ │ │ ├── democrat2.png
│ │ │ ├── ds-chat-single.gif
│ │ │ ├── ds-chat.gif
│ │ │ ├── ds-shiba.png
│ │ │ ├── e2e_RLHF.png
│ │ │ ├── four_blocks.png
│ │ │ ├── ppo_trainer.png
│ │ │ ├── reward_function.png
│ │ │ └── shiba.png
│ │ └── video
│ │ │ └── release_v3.mp4
│ ├── chat.py
│ ├── dschat
│ │ ├── rlhf
│ │ │ ├── ppo_trainer.py
│ │ │ └── rlhf_engine.py
│ │ └── utils
│ │ │ ├── data
│ │ │ ├── data_utils.py
│ │ │ └── raw_datasets.py
│ │ │ ├── ds_utils.py
│ │ │ ├── model
│ │ │ ├── model_utils.py
│ │ │ └── reward_model.py
│ │ │ ├── module
│ │ │ └── lora.py
│ │ │ ├── perf.py
│ │ │ └── utils.py
│ ├── e2e_rlhf.py
│ ├── inference
│ │ └── chatbot.py
│ ├── requirements.txt
│ ├── setup.py
│ ├── tests
│ │ └── test_training.py
│ └── training
│ │ ├── README.md
│ │ ├── step1_supervised_finetuning
│ │ ├── README.md
│ │ ├── evaluation_scripts
│ │ │ └── run_prompt.sh
│ │ ├── main.py
│ │ ├── prompt_eval.py
│ │ ├── training_log_output
│ │ │ └── opt-1.3b-globalBatchSize128.log
│ │ └── training_scripts
│ │ │ ├── README.md
│ │ │ ├── llama2
│ │ │ ├── run_llama2_7b.sh
│ │ │ └── run_llama2_7b_lora.sh
│ │ │ ├── opt
│ │ │ ├── multi_node
│ │ │ │ └── run_66b.sh
│ │ │ ├── single_gpu
│ │ │ │ ├── run_1.3b.sh
│ │ │ │ └── run_6.7b_lora.sh
│ │ │ └── single_node
│ │ │ │ ├── run_1.3b.sh
│ │ │ │ ├── run_1.3b_lora.sh
│ │ │ │ ├── run_13b.sh
│ │ │ │ ├── run_30b_lora.sh
│ │ │ │ ├── run_6.7b.sh
│ │ │ │ └── sweep
│ │ │ │ ├── README.md
│ │ │ │ ├── run_single.sh
│ │ │ │ └── run_step1_sweep.sh
│ │ │ └── other_language
│ │ │ ├── run_chinese.sh
│ │ │ └── run_japanese.sh
│ │ ├── step2_dpo_finetuning
│ │ ├── README.md
│ │ ├── main.py
│ │ ├── training_log_output
│ │ │ └── opt-350M_globalBatchSize-32.log
│ │ └── training_scripts
│ │ │ ├── README.md
│ │ │ ├── llama2
│ │ │ ├── run_llama2_7b.sh
│ │ │ └── run_llama2_7b_lora.sh
│ │ │ └── opt
│ │ │ ├── multi_node
│ │ │ └── run_350m.sh
│ │ │ ├── single_gpu
│ │ │ └── run_350m.sh
│ │ │ └── single_node
│ │ │ ├── run_350m.sh
│ │ │ └── sweep
│ │ │ ├── README.md
│ │ │ ├── run_single.sh
│ │ │ └── run_step2_sweep.sh
│ │ ├── step2_reward_model_finetuning
│ │ ├── README.md
│ │ ├── evaluation_scripts
│ │ │ └── run_eval.sh
│ │ ├── main.py
│ │ ├── rw_eval.py
│ │ ├── training_log_output
│ │ │ └── opt-350m_globalBatchSize-64.log
│ │ └── training_scripts
│ │ │ ├── README.md
│ │ │ ├── llama2
│ │ │ ├── run_llama2_7b.sh
│ │ │ └── run_llama2_7b_lora.sh
│ │ │ └── opt
│ │ │ ├── multi_node
│ │ │ └── run_350m.sh
│ │ │ ├── single_gpu
│ │ │ └── run_350m.sh
│ │ │ └── single_node
│ │ │ ├── run_350m.sh
│ │ │ └── sweep
│ │ │ ├── README.md
│ │ │ ├── run_single.sh
│ │ │ └── run_step2_sweep.sh
│ │ └── step3_rlhf_finetuning
│ │ ├── BenckmarkSetting.md
│ │ ├── README.md
│ │ ├── main.py
│ │ ├── training_log_output
│ │ └── actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log
│ │ └── training_scripts
│ │ ├── README.md
│ │ ├── llama2
│ │ ├── run_llama2_7b.sh
│ │ ├── run_llama2_7b_lora.sh
│ │ └── run_llama2_7b_mixz.sh
│ │ └── opt
│ │ ├── multi_node
│ │ └── run_66b.sh
│ │ ├── single_gpu
│ │ ├── run_1.3b.sh
│ │ └── run_6.7b_lora.sh
│ │ └── single_node
│ │ ├── run_1.3b.sh
│ │ ├── run_1.3b_lora.sh
│ │ ├── run_13b.sh
│ │ ├── run_30b_lora.sh
│ │ ├── run_6.7b.sh
│ │ └── sweep
│ │ ├── README.md
│ │ ├── run_single.sh
│ │ └── run_step3_sweep.sh
└── DeepSpeed-VisualChat
│ ├── README.md
│ ├── assets
│ ├── banner.png
│ ├── ceos.png
│ ├── friends.png
│ ├── hero-figure.png
│ └── model.png
│ ├── chat
│ ├── README.md
│ ├── chat.py
│ └── chat_scripts
│ │ └── run.sh
│ ├── eval
│ ├── README.md
│ ├── batch_generation.py
│ ├── eval_data
│ │ ├── eval_comprehensive.json
│ │ ├── eval_robustness.json
│ │ ├── eval_single.json
│ │ └── images
│ │ │ ├── cats
│ │ │ ├── 1806905748_adb926a0a0.jpg
│ │ │ ├── british_shorthair.jpg
│ │ │ └── cat.png
│ │ │ ├── friends
│ │ │ ├── can-count1.jpg
│ │ │ ├── can-count2.jpg
│ │ │ ├── wrong-count1.jpg
│ │ │ └── wrong-count2.jpg
│ │ │ ├── singles
│ │ │ ├── 1.jpg
│ │ │ ├── 2.jpg
│ │ │ ├── 202160027_b319c4166e.jpg
│ │ │ ├── 50.jpg
│ │ │ ├── extreme_ironing.jpg
│ │ │ └── waterview.jpg
│ │ │ ├── tech-ceo
│ │ │ ├── gate1.jpg
│ │ │ ├── jobs1.jpg
│ │ │ └── musk1.jpg
│ │ │ └── zootopia
│ │ │ ├── z1.png
│ │ │ ├── z2.png
│ │ │ ├── z2a.png
│ │ │ └── z3.png
│ ├── eval_scripts
│ │ └── run_batch.sh
│ └── results
│ │ ├── eval_comprehensive
│ │ ├── ours-set1_best_eval.csv
│ │ ├── ours-set1_final.csv
│ │ ├── ours-set2_best_eval.csv
│ │ └── ours-set2_final.csv
│ │ ├── eval_robustness
│ │ ├── ours-set1_best_eval.csv
│ │ ├── ours-set1_final.csv
│ │ ├── ours-set2_best_eval.csv
│ │ └── ours-set2_final.csv
│ │ └── eval_single
│ │ ├── ours-single_best_eval.csv
│ │ └── ours-single_final.csv
│ ├── helper
│ ├── README.md
│ ├── extract_qwen_vl.py
│ └── qwen_clip
│ │ ├── config.json
│ │ └── preprocessor_config.json
│ ├── requirements.txt
│ ├── training
│ ├── README.md
│ ├── main.py
│ └── training_scripts
│ │ └── run_7b.sh
│ └── utils
│ ├── data
│ ├── DST.py
│ ├── __init__.py
│ ├── aokvqa_dataset.py
│ ├── builder.py
│ ├── cc_sbu_align_dataset.py
│ ├── coco_caption_dataset.py
│ ├── dial_dataset.py
│ ├── llava_dataset.py
│ ├── llava_otter_blend_dataset.py
│ ├── ocr_vqa_dataset.py
│ ├── otter_mimicit_cgd_dataset.py
│ ├── otter_mimicit_sd_dataset.py
│ ├── otter_mimicit_sn_dataset.py
│ ├── otter_mimicit_tvc_dataset.py
│ ├── otter_mimicit_vst_dataset.py
│ ├── sparkles_dialogue_dataset.py
│ ├── utils.py
│ └── vqa_dataset.py
│ ├── ds_utils.py
│ ├── model
│ ├── __init__.py
│ ├── modeling_dsvl.py
│ ├── third_party_model
│ │ ├── hf_model
│ │ │ ├── configuration_llama.py
│ │ │ └── modeling_llama.py
│ │ └── qwen_clip
│ │ │ └── qwen_clip.py
│ └── vis_proj.py
│ ├── module
│ └── lora.py
│ └── utils.py
├── benchmarks
├── README.md
├── communication
│ ├── README.md
│ ├── __init__.py
│ ├── all_gather.py
│ ├── all_reduce.py
│ ├── all_to_all.py
│ ├── broadcast.py
│ ├── constants.py
│ ├── pt2pt.py
│ ├── run_all.py
│ └── utils.py
├── deepcompile
│ ├── .gitignore
│ ├── README.md
│ ├── configs
│ │ ├── ddp_config.yaml.template
│ │ ├── ds_config.json.template
│ │ ├── ds_config.yaml.template
│ │ ├── fsdp_config.yaml.template
│ │ └── singlegpu_config.yaml.template
│ ├── gen_chart_acc_steps.py
│ ├── generate_conf.py
│ ├── hostfile_n4
│ ├── plot.py
│ ├── plot_common.py
│ ├── results
│ │ ├── acc_step_1
│ │ │ └── throughput
│ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png
│ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs2.png
│ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs4.png
│ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs1.png
│ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs2.png
│ │ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs4.png
│ │ └── acc_step_1_16
│ │ │ └── throughput
│ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png
│ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs1.png
│ ├── run.sh
│ ├── run_bench.sh
│ ├── run_bench_acc.sh
│ ├── run_bench_lm.py
│ ├── run_bench_offload.sh
│ ├── run_bench_z1.sh
│ └── run_multinode.sh
└── inference
│ ├── README.md
│ ├── bert-bench.py
│ ├── collect_results.py
│ ├── deepspeedometer
│ ├── README.md
│ ├── configs
│ │ ├── 128k-120.yaml
│ │ ├── 1300-120.yaml
│ │ ├── 2600-60.yaml
│ │ └── 500-500.yaml
│ ├── pyproject.toml
│ ├── run_example.sh
│ ├── src
│ │ └── deepspeedometer
│ │ │ ├── __init__.py
│ │ │ ├── arg_parsing.py
│ │ │ ├── benchmark_runner.py
│ │ │ ├── clients
│ │ │ ├── __init__.py
│ │ │ ├── azure_ml_client.py
│ │ │ ├── base.py
│ │ │ ├── dummy_client.py
│ │ │ ├── fastgen_client.py
│ │ │ ├── openai_client.py
│ │ │ └── vllm_client.py
│ │ │ ├── config.py
│ │ │ ├── prompt.py
│ │ │ ├── response.py
│ │ │ └── sample_input.py
│ └── tests
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_benchmark.py
│ │ ├── test_config.py
│ │ ├── test_early_stop.py
│ │ └── test_prompt.py
│ ├── gpt-bench.py
│ ├── mii
│ ├── A6000_benchmarks_example.PNG
│ ├── README.md
│ ├── plot_config.yaml
│ ├── requirements.txt
│ ├── run_all.sh
│ ├── run_aml.sh
│ ├── run_benchmark.py
│ ├── run_example.sh
│ ├── run_fp6.sh
│ └── src
│ │ ├── __init__.py
│ │ ├── client.py
│ │ ├── defaults.py
│ │ ├── plot_effective_throughput.py
│ │ ├── plot_latency_percentile.py
│ │ ├── plot_repl_scale.py
│ │ ├── plot_th_lat.py
│ │ ├── plot_tp_sizes.py
│ │ ├── postprocess_results.py
│ │ ├── random_query_generator.py
│ │ ├── sample_input.py
│ │ ├── server.py
│ │ └── utils.py
│ ├── requirements.txt
│ ├── run_model.sh
│ ├── run_triton_benchmark.sh
│ ├── sweep.sh
│ └── triton-bert-benchmark.py
├── compression
├── README.md
├── bert
│ ├── README.md
│ ├── bash_script
│ │ ├── XTC
│ │ │ ├── layer_reduction.sh
│ │ │ ├── layer_reduction_1bit.sh
│ │ │ └── quant_1bit.sh
│ │ ├── ZeroQuant
│ │ │ ├── zero_quant.sh
│ │ │ └── zero_quant_lkd.sh
│ │ ├── layer_reduction.sh
│ │ ├── pruning_head.sh
│ │ ├── pruning_row.sh
│ │ ├── pruning_sparse.sh
│ │ ├── pruning_sparse_snip_momentum.sh
│ │ ├── quant_activation.sh
│ │ └── quant_weight.sh
│ ├── config
│ │ ├── XTC
│ │ │ ├── ds_config_W1A8_Qgroup1_fp32.json
│ │ │ ├── ds_config_layer_reduction_W1Q8_fp32.json
│ │ │ └── ds_config_layer_reduction_fp16.json
│ │ ├── ZeroQuant
│ │ │ ├── ds_config_W48A8_Qgroup48_lkd_fp32.json
│ │ │ └── ds_config_W8A8_Qgroup48_fp32.json
│ │ ├── ds_config.json
│ │ ├── ds_config_TEMPLATE.json
│ │ ├── ds_config_W1A8_Qgroup64_fp16.json
│ │ ├── ds_config_W1A8_Qgroup64_fp32.json
│ │ ├── ds_config_W1or2A8_Qgroup64_fp16.json
│ │ └── ds_config_structural_pruning_TEMPLATE.json
│ ├── huggingface_transformer
│ │ └── modeling_bert.py
│ ├── requirements.txt
│ ├── run_glue_lkd.py
│ ├── run_glue_no_trainer.py
│ └── util.py
├── cifar
│ ├── README.md
│ ├── config
│ │ ├── ds_config.json
│ │ └── ds_config_channel_prune.json
│ ├── resnet.py
│ ├── run_compress.sh
│ ├── train.py
│ └── utils.py
└── gpt2
│ ├── README.md
│ ├── bash_script
│ └── run_zero_quant.sh
│ ├── config
│ ├── ds_config.json
│ ├── ds_config_W4or8A8_Qgroup64_fp16.json
│ ├── ds_config_W4or8A8_Qgroup64_fp32.json
│ ├── ds_config_W8A8_Qgroup64_fp16.json
│ └── ds_config_W8A8_Qgroup64_fp32.json
│ ├── requirements.txt
│ └── run_clm_no_trainer.py
├── deepnvme
├── file_access
│ ├── README.md
│ ├── aio_load_cpu_tensor.py
│ ├── aio_load_gpu_tensor.py
│ ├── aio_store_cpu_tensor.py
│ ├── aio_store_gpu_tensor.py
│ ├── gds_load_gpu_tensor.py
│ ├── gds_store_gpu_tensor.py
│ ├── media
│ │ └── deepnvme_ops_report.png
│ ├── py_load_cpu_tensor.py
│ ├── py_load_gpu_tensor.py
│ ├── py_store_cpu_tensor.py
│ ├── py_store_gpu_tensor.py
│ ├── run_load_tensor.sh
│ ├── run_store_tensor.sh
│ └── utils.py
├── model_checkpoint
│ ├── README.md
│ ├── deepspeed_save_model.py
│ ├── requirements.txt
│ ├── save_model_utils.py
│ ├── torch
│ │ ├── serialization_fast_v2.6.0.py
│ │ └── serialization_orig_v2.6.0.py
│ ├── torch_save_model.py
│ ├── torch_save_tensor.py
│ └── torch_save_utils.py
└── zero_inference
│ ├── README.md
│ └── media
│ ├── nvme_config.png
│ ├── zero_inf_mem_use_cpu.png
│ └── zero_inf_mem_use_gds.png
├── evaluation
└── inference
│ └── human_eval
│ ├── README.md
│ └── run_human_eval.py
├── inference
├── huggingface
│ ├── README.md
│ ├── automatic-speech-recognition
│ │ ├── README.md
│ │ ├── requirements.txt
│ │ └── test-wav2vec2.py
│ ├── fill-mask
│ │ ├── README.md
│ │ ├── requirements.txt
│ │ ├── test-bert.py
│ │ ├── test-electra.py
│ │ └── test-roberta.py
│ ├── stable-diffusion
│ │ ├── README.md
│ │ ├── local_pipeline_stable_diffusion.py
│ │ ├── requirements.txt
│ │ └── test-stable-diffusion.py
│ ├── text-generation
│ │ ├── README.md
│ │ ├── arguments.py
│ │ ├── ds-hf-compare.py
│ │ ├── inference-test.py
│ │ ├── requirements.txt
│ │ ├── run-generation-script
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── sample_query.txt
│ │ │ ├── single_query.txt
│ │ │ ├── test-gpt.sh
│ │ │ └── test-run-generation.py
│ │ └── utils.py
│ ├── text2text-generation
│ │ ├── README.md
│ │ ├── requirements.txt
│ │ └── test-t5.py
│ ├── translation
│ │ ├── README.md
│ │ ├── requirements.txt
│ │ └── test-t5-base.py
│ └── zero_inference
│ │ ├── README.md
│ │ ├── images
│ │ └── over_v1.png
│ │ ├── model-support.md
│ │ ├── requirements.txt
│ │ ├── run_bloom175b_a6000.sh
│ │ ├── run_llama2_70b_a6000.sh
│ │ ├── run_model.py
│ │ ├── run_model.sh
│ │ ├── run_opt175b_a6000.sh
│ │ ├── run_opt1p3b_a6000.sh
│ │ ├── run_opt30b_a6000.sh
│ │ ├── run_opt66b_a6000.sh
│ │ ├── timer.py
│ │ └── utils.py
├── mii
│ ├── README.md
│ ├── non-persistent
│ │ ├── README.md
│ │ ├── falcon.py
│ │ ├── llama2.py
│ │ ├── mixtral.py
│ │ └── pipeline.py
│ ├── persistent
│ │ ├── README.md
│ │ ├── client.py
│ │ ├── serve.py
│ │ └── terminate.py
│ └── requirements.txt
└── sglang
│ ├── README.md
│ ├── ds_offload_cpu.json
│ ├── ds_offload_nvme_aio.json
│ ├── ds_offload_nvme_gds.json
│ ├── run_llama3_1B.sh
│ ├── run_llama3_70B.sh
│ └── run_llama3_8B.sh
├── scripts
└── check-license.py
└── training
├── BingBertGlue
├── glue_bert_base.json
├── glue_bert_large.json
├── nvidia
│ ├── modeling.py
│ ├── modelingpreln.py
│ └── modelingpreln_layerdrop.py
├── nvidia_bert_dataset_provider.py
├── pytorch_pretrained_bert
│ ├── __init__.py
│ ├── __main__.py
│ ├── convert_tf_checkpoint_to_pytorch.py
│ ├── file_utils.py
│ ├── modeling.py
│ ├── optimization.py
│ └── tokenization.py
├── run_glue_bert_base_finetune.sh
├── run_glue_bert_large_finetune.sh
├── run_glue_classifier_bert_base.py
├── run_glue_classifier_bert_large.py
└── turing
│ ├── dataset.py
│ ├── file_utils.py
│ ├── logger.py
│ ├── loss.py
│ ├── models.py
│ ├── sources.py
│ ├── text.py
│ └── utils.py
├── BingBertSquad
├── 1-bit_adam
│ ├── mpi_ethernet
│ │ ├── deepspeed_onebitadam_bsz96_config.json
│ │ ├── run_squad_deepspeed_onebitadam.sh
│ │ └── run_squad_mpi_onebitadam.sh
│ ├── mpi_infiniband
│ │ ├── deepspeed_onebitadam_bsz96_config.json
│ │ ├── run_squad_deepspeed_onebitadam.sh
│ │ └── run_squad_mpi_onebitadam.sh
│ └── nccl
│ │ ├── deepspeed_onebitadam_bsz96_config.json
│ │ └── run_squad_deepspeed_onebitadam.sh
├── NOTICE.txt
├── ckpt
│ └── bert-large-uncased-whole-word-masking-config.json
├── convert_bert_ckpt_to_deepspeed.py
├── deepspeed_bsz24_config.json
├── evaluate-v1.1.py
├── evaluate.py
├── nvidia_run_squad_baseline.py
├── nvidia_run_squad_deepspeed.py
├── pytorch_pretrained_bert
│ ├── __init__.py
│ ├── file_utils.py
│ ├── modeling.py
│ ├── optimization.py
│ └── tokenization.py
├── run_hf.sh
├── run_squad_baseline.sh
├── run_squad_deepspeed.sh
├── turing
│ ├── file_utils.py
│ ├── loss.py
│ ├── modelingpreln_layerdrop.py
│ ├── nvidia_modeling.py
│ └── nvidia_modelingpreln.py
└── utils.py
├── DeepSpeed-Domino
├── README.md
├── domino
│ ├── gpt_model.py
│ ├── language_model.py
│ └── training.py
├── pretrain_gpt.py
├── pretrain_gpt3_13b.sh
├── pretrain_gpt3_6.7b.sh
└── requirements.txt
├── HelloDeepSpeed
├── README.md
├── requirements.txt
├── run.sh
├── run_ds.sh
├── tests
│ ├── __init__.py
│ └── test_train_bert.py
├── train_bert.py
└── train_bert_ds.py
├── MoQ
├── README.md
├── huggingface-transformers
│ └── examples
│ │ └── research_projects
│ │ └── lxmert
│ │ └── requirements.txt
├── requirements.txt
├── run.sh
├── run_glue.py
└── test.json
├── autotuning
├── .gitignore
├── README.md
└── hf
│ ├── README.md
│ ├── bert-base
│ ├── README.md
│ ├── ds_config_tune.json
│ └── test_tune.sh
│ ├── bert-large
│ ├── README.md
│ ├── ds_config_tune.json
│ └── test_tune.sh
│ ├── deberta
│ ├── README.md
│ ├── ds_config_fp16_tune.json
│ └── test_tune.sh
│ ├── distilbert
│ ├── README.md
│ ├── ds_config_tune.json
│ └── test_tune.sh
│ ├── dsconfigs
│ ├── ds_config_fp16_tune.json
│ ├── ds_config_fp16_z0.json
│ ├── ds_config_fp16_z1.json
│ ├── ds_config_fp16_z2.json
│ ├── ds_config_fp16_z3.json
│ ├── ds_config_tune.json
│ ├── ds_config_z0.json
│ ├── ds_config_z1.json
│ ├── ds_config_z2.json
│ └── ds_config_z3.json
│ ├── gpt2-large
│ ├── README.md
│ └── test_tune.sh
│ ├── gpt2-medium
│ ├── README.md
│ └── test_tune.sh
│ ├── gpt2-xl
│ ├── README.md
│ └── test_tune.sh
│ └── gpt2
│ ├── README.md
│ └── test_tune.sh
├── bing_bert
├── 01_adam
│ ├── mpi_ethernet
│ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json
│ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json
│ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh
│ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_ethernet.sh
│ ├── mpi_infiniband
│ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json
│ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json
│ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh
│ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_infiniband.sh
│ └── nccl
│ │ ├── deepspeed_bsz4k_01adam_config_seq128_nccl.json
│ │ ├── deepspeed_bsz4k_01adam_config_seq512_nccl.json
│ │ ├── ds_train_bert_01adam_bsz4k_seq128_nccl.sh
│ │ └── ds_train_bert_01adam_bsz4k_seq512_nccl.sh
├── 1-bit_adam
│ ├── mpi_ethernet
│ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json
│ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_ethernet.sh
│ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_ethernet.sh
│ ├── mpi_infiniband
│ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json
│ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh
│ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_infiniband.sh
│ └── nccl
│ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_nccl.json
│ │ └── ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh
├── 1-bit_lamb
│ ├── mpi_ethernet
│ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json
│ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json
│ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_ethernet.sh
│ │ ├── ds_train_bert_onebitlamb_bsz64k_seq128_mpi_ethernet.sh
│ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_ethernet.sh
│ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_ethernet.sh
│ ├── mpi_infiniband
│ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json
│ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json
│ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_infiniband.sh
│ │ ├── ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh
│ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_infiniband.sh
│ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_infiniband.sh
│ └── nccl
│ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json
│ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json
│ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_nccl.sh
│ │ └── ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh
├── NOTICE.txt
├── README.md
├── bert_base.json
├── bert_base_large_lr.json
├── bert_dataset_provider.py
├── bert_large.json
├── bert_large_lamb.json
├── bert_large_lamb_nvidia_data.json
├── bing_bert_dataset_provider.py
├── data_worker.py
├── deepspeed_bsz32k_lamb_config_seq512.json
├── deepspeed_bsz4k_progressive_layer_drop_config_seq128.json
├── deepspeed_bsz64k_lamb_config_seq128.json
├── deepspeed_train.py
├── ds_sa_train_bert_bsz64k_seq128.sh
├── ds_train_bert_bsz32k_seq512.sh
├── ds_train_bert_bsz64k_seq128.sh
├── ds_train_bert_nvidia_data_bsz32k_seq512.sh
├── ds_train_bert_nvidia_data_bsz64k_seq128.sh
├── ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh
├── glue_bert_base.json
├── glue_bert_large.json
├── nvidia
│ ├── modelingpreln.py
│ └── modelingpreln_layerdrop.py
├── nvidia_bert_dataset_provider.py
├── pytorch_pretrained_bert
│ ├── __init__.py
│ ├── __main__.py
│ ├── convert_tf_checkpoint_to_pytorch.py
│ ├── file_utils.py
│ ├── modeling.py
│ ├── optimization.py
│ └── tokenization.py
├── requirements.txt
├── run_glue_bert_base_finetune.sh
├── run_glue_bert_large_finetune.sh
├── run_glue_classifier_bert_base.py
├── run_glue_classifier_bert_large.py
├── timer.py
├── turing
│ ├── dataset.py
│ ├── file_utils.py
│ ├── logger.py
│ ├── loss.py
│ ├── models.py
│ ├── sources.py
│ ├── text.py
│ └── utils.py
└── utils.py
├── cifar
├── LICENSE
├── NOTICE.txt
├── README.md
├── cifar10_deepspeed.py
├── cifar10_tutorial.py
├── requirements.txt
├── run_ds.sh
├── run_ds_moe.sh
└── run_ds_prmoe.sh
├── data_efficiency
├── gpt_finetuning
│ ├── README.md
│ ├── analyze_data.py
│ ├── bash_script
│ │ ├── run_base_random_ltd.sh
│ │ └── run_medium_random_ltd.sh
│ ├── config
│ │ ├── ds_config_gpt_base_random_ltd.json
│ │ └── ds_config_gpt_medium_random_ltd.json
│ ├── finetune
│ │ ├── ds_analyze_gpt_data_map.sh
│ │ ├── ds_analyze_gpt_data_reduce.sh
│ │ ├── ds_config_gpt2-medium_1clmetric_TEMPLATE.json
│ │ ├── ds_config_gpt2-medium_2clmetrics_TEMPLATE.json
│ │ ├── ds_config_gpt2_TEMPLATE.json
│ │ ├── ds_finetune_gpt2.sh
│ │ └── ds_finetune_gpt2_run.sh
│ ├── learning_rates.py
│ ├── requirement.txt
│ └── run_clm_no_trainer.py
├── variable_batch_size_and_lr
│ ├── README.md
│ ├── variable_attn_matrix.png
│ ├── variable_batch_lr.png
│ ├── variable_batch_lr_pipeline.png
│ └── variable_batch_size_and_lr_example.py
└── vit_finetuning
│ ├── README.md
│ ├── bash_script
│ ├── run_cifar_random_ltd.sh
│ └── run_imagenet_random_ltd.sh
│ ├── config
│ ├── ds_config_cifar_random_ltd.json
│ └── ds_config_imagenet_random_ltd.json
│ ├── main_cifar.py
│ ├── main_imagenet.py
│ ├── models
│ ├── __init__.py
│ └── vit.py
│ ├── requirement.txt
│ └── utils
│ ├── __init__.py
│ ├── get_data.py
│ └── utils.py
├── gan
├── gan_baseline_run.sh
├── gan_baseline_train.py
├── gan_deepspeed_config.json
├── gan_deepspeed_run.sh
├── gan_deepspeed_train.py
├── gan_model.py
└── utils.py
├── imagenet
├── README.md
├── assets
│ └── resnetplot.png
├── config
│ ├── ds_config.json
│ ├── ds_fp16_config.json
│ └── ds_fp16_z1_config.json
├── extract_ILSVRC.sh
├── main.py
├── requirements.txt
├── run_ds.sh
├── run_ds_fp16.sh
└── run_ds_fp16_z1.sh
├── megatron
└── README.md
├── offload_states
├── README.md
├── offload_states.py
├── output_table.py
└── run_benchmark.sh
├── pipeline_parallelism
├── alexnet.py
├── ds_config.json
├── run.sh
└── train.py
├── stable_diffusion
├── README.md
├── inf_txt2img_loop.py
├── local_pipeline_stable_diffusion.py
├── mytrainbash.sh
├── requirements.txt
└── train_sd_distil_lora.py
└── tensor_parallel
├── README.md
├── alpaca_data.json
├── configs
├── ds_config.json
└── ds_config_temp.json
├── requirements.txt
├── run.sh
├── train.py
├── train_bench_length.py
└── utils.py
/.github/workflows/formatting.yml:
--------------------------------------------------------------------------------
1 | name: Formatting
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'staging**'
7 | pull_request:
8 | branches:
9 |       - '**'
10 |
11 | concurrency:
12 | group: ${{ github.workflow }}-${{ github.ref }}
13 | cancel-in-progress: true
14 |
15 | jobs:
16 |
17 | # formatting and basic install on cpu-only machine
18 | formatting:
19 | runs-on: ubuntu-22.04
20 |
21 | steps:
22 | - uses: actions/checkout@v2
23 |
24 | - name: environment
25 | run: |
26 | which python
27 | python --version
28 | pip install "pre-commit>=2.20.0"
29 |
30 | - name: Formatting checks
31 | run: |
32 | pre-commit run --all-files
33 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "training/DeepSpeed-Domino/Megatron-LM"]
2 | path = training/DeepSpeed-Domino/Megatron-LM
3 | url = https://github.com/NVIDIA/Megatron-LM.git
4 |
--------------------------------------------------------------------------------
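A note on the submodule above: a fresh clone will have an empty `training/DeepSpeed-Domino/Megatron-LM` directory until the submodule is initialized. A minimal sketch using standard git commands:

```bash
# Clone the examples repo, then fetch the Megatron-LM submodule declared in .gitmodules.
git clone https://github.com/deepspeedai/DeepSpeedExamples.git
cd DeepSpeedExamples
git submodule update --init --recursive   # populates training/DeepSpeed-Domino/Megatron-LM
```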
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v1.2.3
4 | hooks:
5 | - id: trailing-whitespace
6 | exclude: "Megatron-LM/"
7 | files: ^applications/DeepSpeed-Chat/.+
8 | - id: check-yaml
9 | exclude: "Megatron-LM/"
10 | files: ^applications/DeepSpeed-Chat/.+
11 | - id: end-of-file-fixer
12 | exclude: "Megatron-LM/"
13 | files: ^applications/DeepSpeed-Chat/.+
14 |
15 |
16 | - repo: https://github.com/google/yapf
17 | rev: v0.32.0
18 | hooks:
19 | - id: yapf
20 | files: ^applications/DeepSpeed-Chat/.+
21 |
22 | - repo: https://github.com/pycqa/flake8
23 | rev: 4.0.1
24 | hooks:
25 | - id: flake8
26 | args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
27 | files: ^applications/DeepSpeed-Chat/.+
28 |
29 | - repo: local
30 | hooks:
31 | - id: check-license
32 | name: check-license
33 | entry: ./scripts/check-license.py
34 | language: script
35 | files: ^applications/DeepSpeed-Chat/.+\.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr|sh)$
36 |
--------------------------------------------------------------------------------
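Since every hook above is scoped to `^applications/DeepSpeed-Chat/.+`, the same checks can be run locally before pushing. A minimal sketch using the standard pre-commit CLI (hook ids taken from the config above):

```bash
pip install "pre-commit>=2.20.0"
pre-commit install          # run the hooks automatically on each `git commit`
pre-commit run --all-files  # the same command the formatting workflow runs in CI
# Run a single hook against one file, e.g. the yapf formatter:
pre-commit run yapf --files applications/DeepSpeed-Chat/chat.py
```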
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @tjruwase @ShadenSmith @awan-10 @minjiaz
2 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/RLHF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/RLHF.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/democrat2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/democrat2.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/ds-chat.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat.gif
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/ds-shiba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-shiba.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/four_blocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/four_blocks.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/ppo_trainer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ppo_trainer.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/reward_function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/reward_function.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/image/shiba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/shiba.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/assets/video/release_v3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/video/release_v3.mp4
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/chat.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | import argparse
7 | import subprocess
8 |
9 | if __name__ == "__main__":
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--path",
12 | type=str,
13 | help="Directory containing trained actor model")
14 | parser.add_argument(
15 | "--max_new_tokens",
16 | type=int,
17 | default=128,
18 | help="Maximum new tokens to generate per response",
19 | )
20 | args = parser.parse_args()
21 |
22 | cmd = f"python3 ./inference/chatbot.py --path {args.path} --max_new_tokens {args.max_new_tokens}"
23 | p = subprocess.Popen(cmd, shell=True)
24 | p.wait()
25 |
--------------------------------------------------------------------------------
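`chat.py` is a thin launcher that forwards its two flags to `inference/chatbot.py`. A hypothetical invocation (the checkpoint path is an assumption; point `--path` at the actor model directory produced by your own training run):

```bash
# The path below is illustrative, not a directory shipped with the repo.
python chat.py --path ./output_step3_actor --max_new_tokens 256
```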
/applications/DeepSpeed-Chat/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets>=2.8.0
2 | sentencepiece>=0.1.97
3 | protobuf==3.20.3
4 | accelerate>=0.15.0
5 | torch>=1.12.0
6 | deepspeed>=0.9.0
7 | transformers>=4.31.0,!=4.33.2
8 | tensorboard
9 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # setup.py: install script for deepspeed_chat
8 | """
9 | to install deepspeed_chat and its dependencies for development work,
10 | run this cmd from the root directory:
11 | pip install -e .
12 | """
13 | import setuptools
14 |
15 | setuptools.setup(
16 | name="deepspeed-chat",
17 | version="0.1",
18 | url=
19 | "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat",
20 | include_package_data=True,
21 | packages=setuptools.find_packages(include=['dschat']),
22 | install_requires=[
23 | "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3",
24 | "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2",
25 | "transformers>=4.31.0,!=4.33.2", "tensorboard"
26 | ],
27 | extras_require={
28 | "azureml": [
29 | "azure-ml-component",
30 | "azureml-core",
31 | ],
32 | })
33 |
--------------------------------------------------------------------------------
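As the docstring in `setup.py` says, `dschat` is meant to be installed in editable mode from the DeepSpeed-Chat root. A minimal sketch, including the optional `azureml` extra declared in `extras_require`:

```bash
cd applications/DeepSpeed-Chat
pip install -e .              # editable install of dschat plus install_requires
pip install -e ".[azureml]"   # optionally add azure-ml-component and azureml-core
python -c "import dschat"     # sanity check that the package resolves
```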
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # You can provide two models to compare the performance of the baseline and the finetuned model
8 | export CUDA_VISIBLE_DEVICES=0
9 | python prompt_eval.py \
10 | --model_name_or_path_baseline XXX \
11 | --model_name_or_path_finetune XXX
12 |
--------------------------------------------------------------------------------
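The `XXX` placeholders above are meant to be filled in by the user. A hypothetical completed call (both model arguments are assumptions; use your own baseline checkpoint and step-1 output directory):

```bash
export CUDA_VISIBLE_DEVICES=0
python prompt_eval.py \
    --model_name_or_path_baseline facebook/opt-1.3b \
    --model_name_or_path_finetune ./output   # hypothetical step-1 output directory
```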
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/README.md:
--------------------------------------------------------------------------------
1 | ### 💁For each folder, the bash scripts are examples based on the "facebook/opt" model family.
2 |
3 | If you want to use a different model, such as EleutherAI/gpt-j-6b, simply replace
4 | ``--model_name_or_path facebook/opt-1.3b`` with ``--model_name_or_path EleutherAI/gpt-j-6b``.
5 |
6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-).
7 |
--------------------------------------------------------------------------------
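For instance, the swap described above applied to the single-GPU step-1 script is a one-flag change (a sketch; everything besides the model name stays as in `opt/single_gpu/run_1.3b.sh`):

```bash
# Original model flag:
deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-1.3b \
    --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \
    --deepspeed --output_dir $OUTPUT

# Same command with the model swapped:
deepspeed --num_gpus 1 main.py --model_name_or_path EleutherAI/gpt-j-6b \
    --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \
    --deepspeed --output_dir $OUTPUT
```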
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step1_llama2_7b
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 4 \
21 | --per_device_eval_batch_size 4 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0. \
25 | --num_train_epochs 4 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --output_dir $OUTPUT \
34 | &> $OUTPUT/training.log
35 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step1_llama2_7b_lora
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 4 \
21 | --per_device_eval_batch_size 4 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0. \
25 | --num_train_epochs 4 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --lora_dim 128 \
34 | --lora_module_name "layers." \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/multi_node/run_66b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-66b \
20 | --per_device_train_batch_size 4 \
21 | --per_device_eval_batch_size 4 \
22 | --max_seq_len 512 \
23 | --learning_rate 1e-4 \
24 | --weight_decay 0.1 \
25 | --num_train_epochs 2 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --lora_dim 128 \
33 | --lora_module_name decoder.layers. \
34 | --deepspeed \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
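The script above is the multi-node variant but leaves node selection to the DeepSpeed launcher's defaults. A hedged sketch of pinning it to specific hosts (the hostfile contents are an assumption; slot counts should match the GPUs per node):

```bash
# Hypothetical hostfile: one line per node, format "<hostname> slots=<num_gpus>".
cat > hostfile <<'EOF'
worker-0 slots=8
worker-1 slots=8
EOF

# Launch the same job across the hosts above; all other flags as in run_66b.sh.
deepspeed --hostfile=hostfile main.py \
    --model_name_or_path facebook/opt-66b \
    --zero_stage 3 \
    --deepspeed \
    --output_dir ./output   # plus the remaining flags from the script above
```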
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # Note that LoRA usually needs a larger learning rate
8 | OUTPUT=$1
9 | ZERO_STAGE=$2
10 | if [ "$OUTPUT" == "" ]; then
11 | OUTPUT=./output
12 | fi
13 | if [ "$ZERO_STAGE" == "" ]; then
14 | ZERO_STAGE=0
15 | fi
16 | mkdir -p $OUTPUT
17 |
18 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-1.3b \
19 | --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \
20 | --enable_tensorboard \
21 | --tensorboard_path $OUTPUT \
22 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
23 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # Note that LoRA usually needs a larger learning rate
8 | OUTPUT_PATH=./output
9 | mkdir -p $OUTPUT_PATH
10 |
11 | deepspeed --num_gpus 1 main.py \
12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
13 | --data_split 2,4,4 \
14 | --model_name_or_path facebook/opt-6.7b \
15 | --per_device_train_batch_size 8 \
16 | --per_device_eval_batch_size 8 \
17 | --max_seq_len 512 \
18 | --learning_rate 1e-3 \
19 | --weight_decay 0. \
20 | --num_train_epochs 16 \
21 | --gradient_accumulation_steps 16 \
22 | --lr_scheduler_type cosine \
23 | --num_warmup_steps 0 \
24 | --seed 1234 \
25 | --gradient_checkpointing \
26 | --zero_stage 0 \
27 | --lora_dim 128 \
28 | --lora_module_name decoder.layers. \
29 | --deepspeed \
30 | --output_dir $OUTPUT_PATH \
31 | &> $OUTPUT_PATH/training.log
32 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=2
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-1.3b \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 8 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0. \
25 | --num_train_epochs 16 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --zero_stage $ZERO_STAGE \
31 | --deepspeed \
32 | --enable_tensorboard \
33 | --tensorboard_path $OUTPUT \
34 | --output_dir $OUTPUT \
35 | &> $OUTPUT/training.log
36 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # Note that LoRA usually needs a larger learning rate
8 | OUTPUT_PATH=./output
9 | mkdir -p $OUTPUT_PATH
10 |
11 | deepspeed main.py \
12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
13 | --data_split 2,4,4 \
14 | --model_name_or_path facebook/opt-1.3b \
15 | --per_device_train_batch_size 8 \
16 | --per_device_eval_batch_size 8 \
17 | --max_seq_len 512 \
18 | --learning_rate 1e-3 \
19 | --weight_decay 0.1 \
20 | --num_train_epochs 16 \
21 | --gradient_accumulation_steps 1 \
22 | --lr_scheduler_type cosine \
23 | --num_warmup_steps 0 \
24 | --seed 1234 \
25 | --zero_stage 0 \
26 | --lora_dim 128 \
27 | --lora_module_name decoder.layers. \
28 | --only_optimize_lora \
29 | --deepspeed \
30 | --output_dir $OUTPUT_PATH \
31 | &> $OUTPUT_PATH/training.log
32 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_13b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-13b \
20 | --per_device_train_batch_size 4 \
21 | --per_device_eval_batch_size 4 \
22 | --max_seq_len 512 \
23 | --learning_rate 1e-4 \
24 | --weight_decay 0. \
25 | --num_train_epochs 16 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --lora_dim 128 \
33 | --lora_module_name decoder.layers. \
34 | --deepspeed \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_30b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT_PATH=./output
7 | mkdir -p $OUTPUT_PATH
8 |
9 | deepspeed main.py \
10 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
11 | --data_split 2,4,4 \
12 | --model_name_or_path facebook/opt-30b \
13 | --per_device_train_batch_size 4 \
14 | --per_device_eval_batch_size 4 \
15 | --max_seq_len 512 \
16 | --learning_rate 9.65e-6 \
17 | --weight_decay 0. \
18 | --num_train_epochs 16 \
19 | --gradient_accumulation_steps 1 \
20 | --lr_scheduler_type cosine \
21 | --num_warmup_steps 0 \
22 | --seed 1234 \
23 | --lora_dim 128 \
24 | --gradient_checkpointing \
25 | --zero_stage 3 \
26 | --deepspeed \
27 | --output_dir $OUTPUT_PATH \
28 | &> $OUTPUT_PATH/training.log
29 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_6.7b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-6.7b \
20 | --per_device_train_batch_size 6 \
21 | --per_device_eval_batch_size 6 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0. \
25 | --num_train_epochs 16 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --output_dir $OUTPUT \
34 | &> $OUTPUT/training.log
35 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed Characterization Script
2 |
3 | # Contents
4 | * [Introduction](#introduction)
5 | * [Usage](#usage)
6 |
7 | # Introduction
8 | The step 1 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
9 |
10 | Zero Stage: 2, 3
11 | Offload: True, False
12 | Lora: True, False
13 |
14 |
15 | The `run_step1_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate, weight decay, etc.).
16 |
17 | # Usage
18 | The sweep script can be run as follows:
19 |
20 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/opt/single_node/sweep/run_step1_sweep.sh
21 |
22 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | for z in {2..3}
7 | do
8 | for offload in true false
9 | do
10 | for lora in true false
11 | do
12 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
13 | ${z} \
14 | ${offload} \
15 | ${lora} \
16 | z${z}_offload_${offload}_lora_${lora}"
17 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
18 | echo $cmd
19 | $cmd
20 | pkill -9 python
21 | sleep 60
22 | echo ""
23 | done
24 | done
25 | done
26 |
--------------------------------------------------------------------------------
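The sweep README above notes that this script can be extended to additional hyperparameters. A hedged sketch of adding a learning-rate dimension (this assumes `run_single.sh` is adapted to accept the extra positional argument; the values are illustrative):

```bash
#!/bin/bash
# Hypothetical extension of run_step1_sweep.sh with a learning-rate loop.
for z in {2..3}; do
  for offload in true false; do
    for lora in true false; do
      for lr in 9.65e-6 1e-5; do   # illustrative values
        bash training_scripts/opt/single_node/sweep/run_single.sh \
          ${z} ${offload} ${lora} ${lr} \
          z${z}_offload_${offload}_lora_${lora}_lr_${lr}
        pkill -9 python
        sleep 60
      done
    done
  done
done
```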
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_chinese.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=2
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | # The Chinese data we found mostly contains only one response, without another
17 | # "rejected" response. Thus we only test the step 1 finetuning and use
18 | # a data_split of 10,0,0 (keep all data for step 1).
19 | deepspeed main.py \
20 | --data_path wangrui6/Zhihu-KOL Cohere/miracl-zh-queries-22-12 Hello-SimpleAI/HC3-Chinese mkqa-Chinese \
21 | --data_split 10,0,0 \
22 | --model_name_or_path bigscience/bloom-1b1 \
23 | --per_device_train_batch_size 8 \
24 | --per_device_eval_batch_size 8 \
25 | --max_seq_len 512 \
26 | --learning_rate 9.65e-6 \
27 | --weight_decay 0. \
28 | --num_train_epochs 16 \
29 | --gradient_accumulation_steps 1 \
30 | --lr_scheduler_type cosine \
31 | --num_warmup_steps 0 \
32 | --seed 1234 \
33 | --zero_stage $ZERO_STAGE \
34 | --deepspeed \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_japanese.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=2
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | # The Japanese data we found mostly contains only one response, without another
17 | # "rejected" response. Thus we only test the step 1 finetuning and use
18 | # a data_split of 10,0,0 (keep all data for step 1).
19 | deepspeed main.py \
20 | --data_path mkqa-Japanese Cohere/miracl-ja-queries-22-12 lmqg/qg_jaquad lmqg/qag_jaquad \
21 | --data_split 10,0,0 \
22 | --model_name_or_path sberbank-ai/mGPT \
23 | --per_device_train_batch_size 8 \
24 | --per_device_eval_batch_size 8 \
25 | --max_seq_len 512 \
26 | --learning_rate 9.65e-6 \
27 | --weight_decay 0. \
28 | --num_train_epochs 16 \
29 | --gradient_accumulation_steps 1 \
30 | --lr_scheduler_type cosine \
31 | --num_warmup_steps 0 \
32 | --seed 1234 \
33 | --zero_stage $ZERO_STAGE \
34 | --deepspeed \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md:
--------------------------------------------------------------------------------
1 | ### 💁For each folder, the bash scripts are examples based on the "facebook/opt" model family.
2 |
3 | If you want to use a different model, such as EleutherAI/gpt-neo-125m, simply replace
4 | ``--model_name_or_path facebook/opt-350m`` with ``--model_name_or_path EleutherAI/gpt-neo-125m``.
5 |
6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-).
7 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 8 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0.1 \
25 | --num_train_epochs 1 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --offload \
34 | --output_dir $OUTPUT \
35 | &> $OUTPUT/training.log
36 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 8 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0.1 \
25 | --num_train_epochs 1 \
26 | --gradient_accumulation_steps 1 \
27 | --lr_scheduler_type cosine \
28 | --num_warmup_steps 0 \
29 | --seed 1234 \
30 | --gradient_checkpointing \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --offload \
34 | --lora_dim 128 \
35 | --lora_module_name "layers." \
36 | --output_dir $OUTPUT \
37 | &> $OUTPUT/training.log
38 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-350m \
20 | --per_device_train_batch_size 2 \
21 | --per_device_eval_batch_size 2 \
22 | --max_seq_len 512 \
23 | --learning_rate 5e-5 \
24 | --weight_decay 0.1 \
25 | --dropout 0.0 \
26 | --num_train_epochs 1 \
27 | --gradient_accumulation_steps 1 \
28 | --lr_scheduler_type cosine \
29 | --num_warmup_steps 0 \
30 | --seed 1234 \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --output_dir $OUTPUT \
34 | &> $OUTPUT/training.log
35 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
17 | --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
18 | --enable_tensorboard \
19 | --tensorboard_path $OUTPUT \
20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
21 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-350m \
20 | --per_device_train_batch_size 4 \
21 | --per_device_eval_batch_size 4 \
22 | --max_seq_len 512 \
23 | --learning_rate 5e-5 \
24 | --weight_decay 0.1 \
25 | --num_train_epochs 1 \
26 | --dropout 0.0 \
27 | --gradient_accumulation_steps 1 \
28 | --lr_scheduler_type cosine \
29 | --num_warmup_steps 0 \
30 | --seed 1234 \
31 | --zero_stage $ZERO_STAGE \
32 | --deepspeed \
33 | --output_dir $OUTPUT \
34 | &> $OUTPUT/training.log
35 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed Characterization Script
2 |
3 | # Contents
4 | * [Introduction](#introduction)
5 | * [Usage](#usage)
6 |
7 | # Introduction
8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
9 |
10 | * Zero Stage: 2, 3
11 | * Offload: True, False
12 |
13 |
14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc).
15 |
16 | # Usage
17 | The sweep script can be run as follows:
18 |
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
20 |
21 |
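22 | As a minimal sketch of such an extension (assuming `run_single.sh` were modified to accept the learning rate as an additional positional argument; it currently takes three), the sweep loop could gain one more dimension:
23 |
24 | ```bash
25 | for z in {2..3}; do
26 |     for offload in true false; do
27 |         for lr in 9.65e-6 5e-5; do
28 |             bash training_scripts/opt/single_node/sweep/run_single.sh \
29 |                 ${z} ${offload} ${lr} z${z}_offload_${offload}_lr${lr}
30 |         done
31 |     done
32 | done
33 | ```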
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | for z in {2..3}
7 | do
8 | for offload in true false
9 | do
10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
11 | ${z} \
12 | ${offload} \
13 | z${z}_offload_${offload}"
14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
15 | echo $cmd
16 | $cmd
17 | pkill -9 python
18 | sleep 60
19 | echo ""
20 | done
21 | done
22 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/evaluation_scripts/run_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | # Add the path to the finetuned model
8 | python rw_eval.py \
9 | --model_name_or_path
10 |
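11 | # Example (hypothetical path; point this at your own step 2 output directory):
12 | # python rw_eval.py --model_name_or_path ./output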
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/README.md:
--------------------------------------------------------------------------------
1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family.
2 |
3 | If you want to change the model, e.g. to EleutherAI/gpt-neo-125m, simply replace
4 | ``--model_name_or_path facebook/opt-350m`` with ``--model_name_or_path EleutherAI/gpt-neo-125m``.
5 |
6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-)
7 |
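8 | As a minimal sketch (mirroring the flags used by the single-GPU OPT script in this folder), a run with a hypothetical replacement model might look like:
9 |
10 | ```bash
11 | deepspeed --num_gpus 1 main.py \
12 |    --model_name_or_path EleutherAI/gpt-neo-125m \
13 |    --num_padding_at_beginning 0 \
14 |    --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 \
15 |    --zero_stage 0 --deepspeed --output_dir ./output
16 | ```
17 |
18 | Note that ``--num_padding_at_beginning 1`` is specific to the OPT tokenizer; the llama2 scripts in this folder pass ``0``.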
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 8 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0.1 \
25 | --num_padding_at_beginning 0 \
26 | --num_train_epochs 1 \
27 | --gradient_accumulation_steps 1 \
28 | --lr_scheduler_type cosine \
29 | --num_warmup_steps 0 \
30 | --seed 1234 \
31 | --gradient_checkpointing \
32 | --zero_stage $ZERO_STAGE \
33 | --deepspeed \
34 | --offload \
35 | --output_dir $OUTPUT \
36 | &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=3
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static \
18 | --data_split 2,4,4 \
19 | --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 8 \
22 | --max_seq_len 512 \
23 | --learning_rate 9.65e-6 \
24 | --weight_decay 0.1 \
25 | --num_padding_at_beginning 0 \
26 | --num_train_epochs 1 \
27 | --gradient_accumulation_steps 1 \
28 | --lr_scheduler_type cosine \
29 | --num_warmup_steps 0 \
30 | --seed 1234 \
31 | --gradient_checkpointing \
32 | --zero_stage $ZERO_STAGE \
33 | --deepspeed \
34 | --offload \
35 | --lora_dim 128 \
36 | --lora_module_name "layers." \
37 | --output_dir $OUTPUT \
38 | &> $OUTPUT/training.log
39 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-350m \
20 | --num_padding_at_beginning 1 \
21 | --per_device_train_batch_size 2 \
22 | --per_device_eval_batch_size 2 \
23 | --max_seq_len 512 \
24 | --learning_rate 5e-5 \
25 | --weight_decay 0.1 \
26 | --dropout 0.0 \
27 | --num_train_epochs 1 \
28 | --gradient_accumulation_steps 1 \
29 | --lr_scheduler_type cosine \
30 | --num_warmup_steps 0 \
31 | --seed 1234 \
32 | --zero_stage $ZERO_STAGE \
33 | --deepspeed \
34 | --output_dir $OUTPUT \
35 | &> $OUTPUT/training.log
36 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
17 | --num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
18 | --enable_tensorboard \
19 | --tensorboard_path $OUTPUT \
20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
21 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 | if [ "$OUTPUT" == "" ]; then
9 | OUTPUT=./output
10 | fi
11 | if [ "$ZERO_STAGE" == "" ]; then
12 | ZERO_STAGE=0
13 | fi
14 | mkdir -p $OUTPUT
15 |
16 | deepspeed main.py \
17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
18 | --data_split 2,4,4 \
19 | --model_name_or_path facebook/opt-350m \
20 | --num_padding_at_beginning 1 \
21 | --per_device_train_batch_size 4 \
22 | --per_device_eval_batch_size 4 \
23 | --max_seq_len 512 \
24 | --learning_rate 5e-5 \
25 | --weight_decay 0.1 \
26 | --num_train_epochs 1 \
27 | --dropout 0.0 \
28 | --gradient_accumulation_steps 1 \
29 | --lr_scheduler_type cosine \
30 | --num_warmup_steps 0 \
31 | --seed 1234 \
32 | --zero_stage $ZERO_STAGE \
33 | --deepspeed \
34 | --output_dir $OUTPUT \
35 | &> $OUTPUT/training.log
36 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed Characterization Script
2 |
3 | # Contents
4 | * [Introduction](#introduction)
5 | * [Usage](#usage)
6 |
7 | # Introduction
8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
9 |
10 | * Zero Stage: 2, 3
11 | * Offload: True, False
12 |
13 |
14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc).
15 |
16 | # Usage
17 | The sweep script can be run as follows:
18 |
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
20 |
21 |
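22 | After a sweep finishes, the per-configuration logs can be compared side by side. A minimal sketch, assuming `run_single.sh` treats its third argument (`z${z}_offload_${offload}`) as the output directory and writes a `training.log` there, as the non-sweep scripts in this folder do:
23 |
24 | ```bash
25 | for d in z*_offload_*; do
26 |     echo "=== ${d} ==="
27 |     tail -n 5 "${d}/training.log"
28 | done
29 | ```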
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | for z in {2..3}
7 | do
8 | for offload in true false
9 | do
10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
11 | ${z} \
12 | ${offload} \
13 | z${z}_offload_${offload}"
14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
15 | echo $cmd
16 | $cmd
17 | pkill -9 python
18 | sleep 60
19 | echo ""
20 | done
21 | done
22 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/README.md:
--------------------------------------------------------------------------------
1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family.
2 |
3 | If you want to change the models, simply update
4 | ``` --actor_model_name_or_path ${step1_path} --critic_model_name_or_path ${step2_path} ```.
5 |
6 | If you don't have step 1 and step 2 models, you may simply try
7 | ``` bash
8 | --actor_model_name_or_path facebook/opt-1.3b --critic_model_name_or_path facebook/opt-350m
9 | ```
10 | ⚡⚡⚡ When you use the above models, please make sure to set the `rlhf_training` parameter to `False` in both calls to the `create_critic_model` function in [rlhf_engine.py](./../../../dschat/rlhf/rlhf_engine.py), so that it won't try to load model weights from the previous training paths.
11 |
12 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-)
13 |
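14 | For example, with the single-GPU script in this step's folder (a sketch; it assumes the `rlhf_training` change above has been made), the two public models can be passed as the first two positional arguments:
15 |
16 | ```bash
17 | bash training_scripts/opt/single_gpu/run_1.3b.sh \
18 |     facebook/opt-1.3b facebook/opt-350m 0 0 ./output
19 | ```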
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | ACTOR_MODEL_PATH=$1
7 | CRITIC_MODEL_PATH=$2
8 | ACTOR_ZERO_STAGE=$3
9 | CRITIC_ZERO_STAGE=$4
10 | OUTPUT=$5
11 | if [ "$OUTPUT" == "" ]; then
12 | OUTPUT=./output
13 | fi
14 | if [ "$ACTOR_ZERO_STAGE" == "" ]; then
15 | ACTOR_ZERO_STAGE=0
16 | fi
17 | if [ "$CRITIC_ZERO_STAGE" == "" ]; then
18 | CRITIC_ZERO_STAGE=0
19 | fi
20 | mkdir -p $OUTPUT
21 |
22 | deepspeed --num_gpus 1 main.py \
23 | --actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \
24 | --actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \
25 | --num_padding_at_beginning 1 --gradient_accumulation_steps 2 \
26 | --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \
27 | --output_dir $OUTPUT &> $OUTPUT/training.log
28 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed Characterization Script
2 |
3 | # Contents
4 | * [Introduction](#introduction)
5 | * [Usage](#usage)
6 |
7 | # Introduction
8 | The step 3 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
9 |
10 | * Zero Stage: 2, 3
11 | * Hybrid Engine: True, False
12 | * Offload: True, False
13 | * Lora: True, False
14 |
15 |
16 | The `run_step3_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc).
17 |
18 | # Usage
19 | The sweep script can be run as follows:
20 |
21 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
22 |
23 |
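24 | The four swept parameters above yield 2 × 2 × 2 × 2 = 16 configurations. A minimal sketch of the nested loops (assuming, as in the step 2 sweep, that `run_single.sh` takes the configuration values followed by an output tag as positional arguments):
25 |
26 | ```bash
27 | for z in {2..3}; do
28 |     for he in true false; do
29 |         for offload in true false; do
30 |             for lora in true false; do
31 |                 bash training_scripts/opt/single_node/sweep/run_single.sh \
32 |                     ${z} ${he} ${offload} ${lora} z${z}_he_${he}_offload_${offload}_lora_${lora}
33 |             done
34 |         done
35 |     done
36 | done
37 | ```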
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/banner.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/assets/ceos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/ceos.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/assets/friends.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/friends.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/assets/hero-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/hero-figure.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/assets/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/model.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/chat/README.md:
--------------------------------------------------------------------------------
1 | We provide a CLI interface for users to test their trained chat model. First of all, please note that you need to provide both the trained checkpoint and the paths to the original language model and vision encoder: the model is first initialized from those and then loads the trained checkpoint. Also, please note that if you used multi-modal causal attention during training, remember to pass --enable_mmca_attention in your chat script.
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | MAIN_PATH=$1
7 |
8 | VISION_ENCODER=/blob/transformers_cache/qwen-clip
9 | LLM=/blob/transformers_cache/Llama-2-13b-hf
10 |
11 | export CUDA_VISIBLE_DEVICES=0 # Do single-GPU evaluation
12 | # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # Do multi gpu evaluation for large models (single GPU is not enough)
13 |
14 |
15 | python chat.py \
16 | --lm_model_name_or_path $LLM \
17 | --vision_model_name_or_path $VISION_ENCODER \
18 | --checkpoint_path $MAIN_PATH --enable_mmca_attention
19 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json:
--------------------------------------------------------------------------------
1 | {
2 | "cat_images1": [["please describe the image", "./eval/eval_data/images/cats/cat.png"]],
3 | "cat_images2": [["can you describe the image", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"]],
4 | "cat_images3": [["please describe the image", "./eval/eval_data/images/cats/british_shorthair.jpg"]],
5 | "extreme_ironing": [["What is unusual about this image?", "./eval/eval_data/images/singles/extreme_ironing.jpg"]],
6 | "waterview": [["What are the things I should be cautious about when I visit here?", "./eval/eval_data/images/singles/waterview.jpg"]],
7 | "art-dog": [["can you describe the image", "./eval/eval_data/images/singles/202160027_b319c4166e.jpg"]],
8 | "funny-phone": [["What is funny about this image? Describe it panel by panel.", "./eval/eval_data/images/singles/1.jpg"]],
9 | "squirrel": [["Why would a person find this image funny?", "./eval/eval_data/images/singles/2.jpg"]],
10 | "art-painting": [["Tell me about this work of art.", "./eval/eval_data/images/singles/50.jpg"]]
11 | }
12 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 | # DeepSpeed Team
5 |
6 | # EVAL_DATASET can be eval_robustness, eval_single, or eval_comprehensive (see the json files in ./eval_data/)
7 | MAIN_PATH=$1
8 | VISION_MODEL=/blob/transformers_cache/qwen-clip #openai/clip-vit-large-patch14
9 | LLM=/blob/transformers_cache/Llama-2-13b-hf #meta-llama/Llama-2-7b
10 | for EVAL_DATASET in eval_single eval_comprehensive eval_robustness
11 | do
12 | SAVE_PATH=eval/results/${EVAL_DATASET}
13 | mkdir -p ${SAVE_PATH}
14 | for CKPT_NAME in final best_eval
15 | do
16 | #NOTE: to run multi-GPU, you simply do "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;"
17 | export CUDA_VISIBLE_DEVICES=0; python eval/batch_generation.py --model_name dsvl --vis_proj baseline --max_seq_len 4096 \
18 | --lm_model_name_or_path ${LLM} --vision_model_name_or_path ${VISION_MODEL} \
19 | --checkpoint_path $MAIN_PATH --checkpoint_names $CKPT_NAME --eval_data ${EVAL_DATASET} \
20 | --enable_mmca_attention --output_filename ${SAVE_PATH}/ours_${CKPT_NAME} &> ${SAVE_PATH}/ours_${CKPT_NAME}.log
21 | done
22 | done
23 |
24 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/helper/README.md:
--------------------------------------------------------------------------------
1 | # QWen-VL's Vision Encoder
2 | The `extract_qwen_vl.py` script can be used to extract the vision encoder from QWen-VL. After extraction, you can find the other necessary files in the [folder](./qwen_clip).
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from transformers import AutoModelForCausalLM
5 |
6 | PATH = "Qwen/Qwen-VL-Chat"
7 |
8 | model = AutoModelForCausalLM.from_pretrained(PATH, device_map="cuda", trust_remote_code=True).eval()
9 |
10 | # Keep only the vision-encoder weights, dropping the "transformer.visual." prefix.
11 | state_dict = model.state_dict()
12 | save_dict = {}
13 | for k, v in state_dict.items():
14 |     if 'visual' in k:
15 |         if 'transformer.visual.proj' not in k:  # we don't need the proj layer
16 |             save_dict[k.replace('transformer.visual.', '')] = v
17 |
18 | os.makedirs('./qwen_clip', exist_ok=True)  # make sure the output folder exists
19 | torch.save(save_dict, './qwen_clip/pytorch_model.bin')
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "crop_size": 448,
3 | "do_center_crop": true,
4 | "do_normalize": true,
5 | "do_resize": true,
6 | "feature_extractor_type": "CLIPFeatureExtractor",
7 | "image_mean": [
8 | 0.48145466,
9 | 0.4578275,
10 | 0.40821073
11 | ],
12 | "image_std": [
13 | 0.26862954,
14 | 0.26130258,
15 | 0.27577711
16 | ],
17 | "resample": 3,
18 | "size": 448
19 | }
20 |
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets>=2.8.0
2 | sentencepiece>=0.1.97
3 | protobuf==3.20.3
4 | accelerate>=0.15.0
5 | open_clip_torch
6 | deepspeed>=0.10.3
7 | einops
8 | einops_exts
9 | transformers==4.33.3
10 | transformers_stream_generator
11 | termcolor
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/utils/data/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is adapted from https://github.com/open-mmlab/Multimodal-GPT
2 |
3 | from .builder import build_dataset # noqa: F401
4 | from .vqa_dataset import VQADataset # noqa: F401
5 | from .utils import DataCollatorPadToMaxLen, split_dataset, shuffle_dataset # noqa: F401
6 | from .DST import add_special_token
--------------------------------------------------------------------------------
/applications/DeepSpeed-VisualChat/utils/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .modeling_dsvl import create_dsvl_model_and_transforms
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | All benchmarks that use the DeepSpeed library are maintained in this folder. We welcome contributions in this space!
2 |
--------------------------------------------------------------------------------
/benchmarks/communication/__init__.py:
--------------------------------------------------------------------------------
1 | '''Copyright The Microsoft DeepSpeed Team'''
--------------------------------------------------------------------------------
/benchmarks/communication/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | from deepspeed.accelerator import get_accelerator
7 |
8 | DEFAULT_WARMUPS = 5
9 | DEFAULT_TRIALS = 50
10 | DEFAULT_TYPE = 'float'
11 | DEFAULT_BACKEND = get_accelerator().communication_backend_name()
12 | DEFAULT_UNIT = 'Gbps'
13 | DEFAULT_DIST = 'deepspeed'
14 | DEFAULT_MAXSIZE = 24
15 | DEFAULT_DEVICE = 'cuda'
16 | TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
17 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | *.pyc
3 | *.png
4 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/configs/ddp_config.yaml.template:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_GPU
4 | machine_rank: {{ machine_rank }}
5 | main_training_function: main
6 | mixed_precision: bf16
7 | num_machines: {{ num_machines }}
8 | num_processes: {{ num_processes }}
9 | rdzv_backend: static
10 | same_network: true
11 | tpu_env: []
12 | tpu_use_cluster: false
13 | tpu_use_sudo: false
14 | use_cpu: false
15 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/configs/ds_config.json.template:
--------------------------------------------------------------------------------
1 | {
2 | {% if fp16 %}
3 | "fp16": {
4 | "enabled": true,
5 | "initial_scale_power": 8
6 | },
7 | {% else %}
8 | "bf16": {
9 | "enabled": true
10 | },
11 | {% endif %}
12 | "zero_optimization": {
13 | "stage": {{ zero_stage }},
14 | "sub_group_size": 100000000
15 | },
16 | "compile": {
17 | "deepcompile": {{ deepcompile }},
18 | "offload_activation": false,
19 | "offload_opt_states": false,
20 | "double_buffer": true,
21 | "symmetric_memory": false,
22 | "free_activation": false,
23 | "debug_log": {{ debug_log }},
24 | "sync_before_reduce": {{ sync_before_reduce }},
25 | "sync_after_reduce": {{ sync_after_reduce }}
26 | },
27 | "gradient_accumulation_steps": {{ gradient_accumulation_steps }},
28 | "gradient_clipping": "auto",
29 | "steps_per_print": 2000,
30 | "train_batch_size": "auto",
31 | "train_micro_batch_size_per_gpu": "auto",
32 | "wall_clock_breakdown": false
33 | }
--------------------------------------------------------------------------------
/benchmarks/deepcompile/configs/ds_config.yaml.template:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | deepspeed_multinode_launcher: standard
5 | {%- if zero_stage == 3 %}
6 | zero3_init_flag: true
7 | {%- endif %}
8 | deepspeed_config_file: configs/ds_config.json
9 | distributed_type: DEEPSPEED
10 | machine_rank: {{ machine_rank }}
11 | main_training_function: main
12 | num_machines: {{ num_machines }}
13 | num_processes: {{ num_processes }}
14 | rdzv_backend: static
15 | same_network: true
16 | tpu_env: []
17 | tpu_use_cluster: false
18 | tpu_use_sudo: false
19 | use_cpu: false
--------------------------------------------------------------------------------
/benchmarks/deepcompile/configs/fsdp_config.yaml.template:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | fsdp_config:
5 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
6 | fsdp_backward_prefetch: BACKWARD_PRE
7 | fsdp_cpu_ram_efficient_loading: true
8 | fsdp_forward_prefetch: false
9 | fsdp_offload_params: false
10 | {%- if zero_stage == 3 %}
11 | fsdp_sharding_strategy: FULL_SHARD
12 | {%- else %}
13 | fsdp_sharding_strategy: SHARD_GRAD_OP
14 | {%- endif %}
15 | fsdp_state_dict_type: SHARDED_STATE_DICT
16 | fsdp_sync_module_states: true
17 | fsdp_use_orig_params: true
18 | machine_rank: {{ machine_rank }}
19 | main_training_function: main
20 | mixed_precision: bf16
21 | num_machines: {{ num_machines }}
22 | num_processes: {{ num_processes }}
23 | rdzv_backend: static
24 | same_network: true
25 | tpu_env: []
26 | tpu_use_cluster: false
27 | tpu_use_sudo: false
28 | use_cpu: false
29 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/configs/singlegpu_config.yaml.template:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: NO
4 | main_training_function: main
5 | mixed_precision: bf16
6 | use_cpu: false
7 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/hostfile_n4:
--------------------------------------------------------------------------------
1 | node-0 slots=8
2 | node-1 slots=8
3 | node-2 slots=8
4 | node-3 slots=8
5 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png
--------------------------------------------------------------------------------
/benchmarks/deepcompile/run_bench_offload.sh:
--------------------------------------------------------------------------------
1 | PROFILE_DIR=${PROFILE_DIR:-"profile_offload"}
2 | mkdir -p ${PROFILE_DIR}
3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}"
4 | COMPILE_OPTS="--compile"
5 | DC_OPTS="--compile --deepcompile"
6 | ACC_OPTS="--gradient-accumulation-steps 1"
7 | AC_OPTS="--activation-checkpointing"
8 |
9 | mkdir -p logs
10 |
11 | export LOG_BASE="logs_offload"
12 | mkdir -p ${LOG_BASE}
13 |
14 | MODEL="meta-llama/Meta-Llama-3-70B-Instruct"
15 | BATCH_SIZE_OPTS=(1)
16 | SEQ_LENGTH_OPTS=(1024)
17 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do
18 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do
19 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} ${ACC_OPTS} ${AC_OPTS} ${PROFILE_OPTS}"
20 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3
21 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3 --ds-offload
22 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states
23 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states_sync
24 | done
25 | done
26 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/run_bench_z1.sh:
--------------------------------------------------------------------------------
1 | PROFILE_DIR=${PROFILE_DIR:-profiles}
2 | mkdir -p ${PROFILE_DIR}
3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}"
4 | COMPILE_OPTS="--compile"
5 | DC_OPTS="--compile --deepcompile"
6 | ACC_OPTS="--gradient-accumulation-steps 1"
7 | AC_OPTS="--activation-checkpointing"
8 |
9 | export NUM_NODES=${NUM_NODES:-4}
10 |
11 | MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
12 | BATCH_SIZE_OPTS=(1 2 4)
13 | SEQ_LENGTH_OPTS=(512 1024 2048)
14 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do
15 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do
16 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} --zero-stage 1 ${ACC_OPTS} ${AC_OPTS}"
17 | bash ./run_multinode.sh --backend deepspeed ${ARGS}
18 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${COMPILE_OPTS}
19 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${DC_OPTS}
20 |
21 | cp -r logs ${PROFILE_DIR}/
22 | done
23 | done
24 |
--------------------------------------------------------------------------------
/benchmarks/deepcompile/run_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo $*
4 |
5 | SCRIPT_DIR=$(dirname $(realpath $0))
6 | HOST_IP=$(hostname -i)
7 | NUM_NODES=${NUM_NODES:-1}
8 |
9 | # verify that NUM_NODES is a positive integer
10 | if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then
11 | echo "Error: NUM_NODES must be a positive integer"
12 | exit 1
13 | fi
14 |
15 | # check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists
16 | if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then
17 | echo "Error: hostfile_n${NUM_NODES} does not exist"
18 | exit 1
19 | fi
20 |
21 | if [ "${NUM_NODES}" == "1" ]; then
22 | # avoid dependency on pdsh when possible
23 | cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*
24 | else
25 | ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*"
26 | fi
27 |
--------------------------------------------------------------------------------
/benchmarks/inference/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/configs/128k-120.yaml:
--------------------------------------------------------------------------------
1 | prompt_length: 128000
2 | prompt_length_var: 0.1
3 | max_prompt_length: 131072
4 | max_new_tokens: 120
5 | max_new_tokens_var: 0.3
6 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/configs/1300-120.yaml:
--------------------------------------------------------------------------------
1 | prompt_length: 1300
2 | prompt_length_var: 0.3
3 | max_new_tokens: 120
4 | max_new_tokens_var: 0.3
5 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/configs/2600-60.yaml:
--------------------------------------------------------------------------------
1 | prompt_length: 2600
2 | prompt_length_var: 0.3
3 | max_new_tokens: 60
4 | max_new_tokens_var: 0.3
5 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/configs/500-500.yaml:
--------------------------------------------------------------------------------
1 | prompt_length: 500
2 | prompt_length_var: 0.3
3 | max_new_tokens: 500
4 | max_new_tokens_var: 0.3
5 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 | [project]
5 | name = "deepspeedometer"
6 | version = "0.0.1"
7 | authors = [
8 | { name="Ammar Ahmad Awan", email="ammar.awan@microsoft.com" },
9 | { name="Arash Bakhitiari", email="abakhtiari@microsoft.com" },
10 | { name="Connor Holmes"},
11 | { name="Lev Kurilenko", email="lev.kurilenko@microsoft.com" },
12 | { name="Heyang Qin", email="heyangqin@microsoft.com" },
13 | { name="Masahiro Tanaka", email="mtanaka@microsoft.com" },
14 | { name="Michael Wyatt", email="michaelwyatt@microsoft.com" },
15 | ]
16 | description = "LLM benchmarking tool"
17 | readme = "README.md"
18 | requires-python = ">=3.8"
19 | classifiers = [
20 | "Programming Language :: Python :: 3",
21 | ]
22 | dependencies = [
23 | "loguru",
24 | "pydantic>=2.0.0",
25 | "torch",
26 | "tqdm",
27 | "transformers",
28 | ]
29 |
30 | [project.urls]
31 | Homepage = "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference/deepspeedometer"
32 | Issues = "https://github.com/deepspeedai/DeepSpeedExamples/issues"
33 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/run_example.sh:
--------------------------------------------------------------------------------
1 | python -m src.deepspeedometer.benchmark_runner --model "facebook/opt-125m" --api dummy --config_file ./configs/1300-120.yaml
2 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py:
--------------------------------------------------------------------------------
1 | from .arg_parsing import parse_args_to_configs
2 | from .benchmark_runner import BenchmarkRunner
3 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import BaseClient
2 |
3 | from .azure_ml_client import AzureMLClientConfig, AzureMLClient
4 | from .dummy_client import DummyClientConfig, DummyClient
5 | from .fastgen_client import FastGenClientConfig, FastGenClient
6 | from .vllm_client import vLLMClientConfig, vLLMClient
7 | from .openai_client import openaiClientConfig, openaiClient
8 |
9 | client_config_classes = {
10 | "dummy": DummyClientConfig,
11 | "azure_ml": AzureMLClientConfig,
12 | "fastgen": FastGenClientConfig,
13 | "vllm": vLLMClientConfig,
14 | "openai": openaiClientConfig
15 | }
16 | client_classes = {
17 | "dummy": DummyClient,
18 | "azure_ml": AzureMLClient,
19 | "fastgen": FastGenClient,
20 | "vllm": vLLMClient,
21 | "openai": openaiClient,
22 | }
23 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any, Dict
3 |
4 | from ..config import BaseConfigModel
5 | from ..prompt import Prompt
6 |
7 |
8 | class BaseClient(ABC):
9 | def __init__(self, config: BaseConfigModel) -> None:
10 | self.config = config
11 |
12 | @abstractmethod
13 | def start_service(self) -> None:
14 | pass
15 |
16 | @abstractmethod
17 | def stop_service(self) -> None:
18 | pass
19 |
20 | @abstractmethod
21 | def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
22 | pass
23 |
24 | @abstractmethod
25 | def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
26 | pass
27 |
28 | @abstractmethod
29 | def process_response(self, raw_response: Any) -> str:
30 | pass
31 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel, ConfigDict
2 |
3 |
4 | class BaseConfigModel(BaseModel):
5 | model_config = ConfigDict(
6 | validate_default=True,
7 | validate_assignment=False,
8 | use_enum_values=True,
9 | populate_by_name=True,
10 | extra="forbid",
11 | arbitrary_types_allowed=True,
12 | protected_namespaces=(),
13 | )
14 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 | from typing import Any
3 |
4 |
5 | @dataclass
6 | class Response:
7 | prompt_text: str = ""
8 | prompt_tokens: int = 0
9 | generated_output: str = ""
10 | generated_tokens: int = 0
11 | request_time: float = 0
12 | raw_response: Any = None
13 | client_id: int = 0
14 |
15 | def to_dict(self) -> dict:
16 | return asdict(self)
17 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/README.md:
--------------------------------------------------------------------------------
1 | To run the unit tests:
2 |
3 | `python3 -m pytest .`
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/deepspeedometer/tests/__init__.py
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/test_benchmark.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner
4 |
5 |
6 | def test_benchmark_runner(benchmark_args, num_clients):
7 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
8 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config)
9 | benchmark_runner.run()
10 |
11 | expected_results = sum(1 for _ in benchmark_runner._benchmark_settings()) * len(
12 | num_clients
13 | )
14 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json")))
15 | assert (
16 | expected_results == actual_results
17 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})."
18 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import yaml
4 |
5 | import pydantic
6 |
7 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs
8 |
9 |
10 | def test_config(benchmark_args):
11 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
12 |
13 |
14 | @pytest.mark.parametrize("model", [""])
15 | def test_config_required_fail(benchmark_args):
16 | with pytest.raises(pydantic.ValidationError):
17 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
18 |
19 |
20 | @pytest.mark.parametrize("num_config_files", [1])
21 | def test_config_file(benchmark_args, config_files, num_clients):
22 | # Create a config that would generate 6 benchmark settings
23 | config = {"max_prompt_length": [500, 1300, 2600], "num_clients": [1, 2]}
24 | with open(config_files[0], "w") as f:
25 | yaml.dump(config, f)
26 |
27 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
28 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config)
29 | benchmark_settings = sum(1 for _ in benchmark_runner._benchmark_settings()) * len(
30 | num_clients
31 | )
32 | assert benchmark_settings == 6
33 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/test_early_stop.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner
4 |
5 |
6 | @pytest.mark.parametrize("num_clients", [(1, 2, 4)], indirect=True)
7 | def test_early_stop(benchmark_args):
8 | benchmark_args += [
9 | "--early_stop_latency",
10 | "1",
11 | "--dummy_client_latency_time",
12 | "2.0",
13 | ]
14 | print(benchmark_args)
15 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
16 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config)
17 | benchmark_runner.run()
18 |
19 | expected_results = 1
20 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json")))
21 | assert (
22 | expected_results == actual_results
23 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})."
24 |
--------------------------------------------------------------------------------
/benchmarks/inference/deepspeedometer/tests/test_prompt.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs
4 |
5 |
6 | @pytest.mark.parametrize("prompt_length_var, max_new_tokens_var", [(0, 0)])
7 | def test_prompt_length(benchmark_args):
8 | benchmark_config, client_config = parse_args_to_configs(benchmark_args)
9 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config)
10 | num_clients, prompt_config = next(benchmark_runner._benchmark_settings())
11 |
12 | for prompt in benchmark_runner.prompt_generator(prompt_config, num_prompts=10):
13 | prompt_length = benchmark_runner.prompt_generator.count_tokens(prompt.text)
14 | # Using pytest.approx here because we often see off-by-one errors due to tokenization special tokens
15 | assert prompt_length == pytest.approx(benchmark_runner.config.prompt_length, abs=1)
16 |
--------------------------------------------------------------------------------
/benchmarks/inference/mii/A6000_benchmarks_example.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/mii/A6000_benchmarks_example.PNG
--------------------------------------------------------------------------------
/benchmarks/inference/mii/plot_config.yaml:
--------------------------------------------------------------------------------
1 | label: "vLLM"
2 | color: "purple"
3 | marker: "o"
4 | linestyle: "--"
5 | polyfit_degree: 0
6 | x_max : 30
7 | y_max : 10
8 |
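These keys appear to mirror matplotlib's `plot` keyword arguments, with `x_max`/`y_max` as axis limits and `polyfit_degree` selecting an optional polynomial fit (0 seemingly disables it). A hedged sketch of such a consumer; the authoritative logic is in `src/plot_th_lat.py`:

```python
import yaml
import matplotlib.pyplot as plt

with open("plot_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Placeholder throughput/latency points, purely for illustration.
x, y = [1, 5, 10, 20], [2.0, 4.5, 6.0, 8.5]
plt.plot(x, y, label=cfg["label"], color=cfg["color"],
         marker=cfg["marker"], linestyle=cfg["linestyle"])
plt.xlim(0, cfg["x_max"])
plt.ylim(0, cfg["y_max"])
plt.legend()
plt.savefig("example_plot.png")
```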
--------------------------------------------------------------------------------
/benchmarks/inference/mii/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | matplotlib
3 | deepspeed-mii>=0.2.0
4 | vllm>=0.2.7
5 | numpy
6 | tabulate
7 |
--------------------------------------------------------------------------------
/benchmarks/inference/mii/run_all.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40b tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1)
7 |
8 | for MODEL in ${MODELS[@]}; do
9 | python ./run_benchmark.py --model ${MODEL} --stream --backend fastgen
10 | python ./run_benchmark.py --model ${MODEL} --stream --backend vllm
11 | done
12 |
13 | # Extra runs for Mixtral with non-default settings
14 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend fastgen
15 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend vllm
--------------------------------------------------------------------------------
/benchmarks/inference/mii/run_aml.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | # Run benchmark against AML endpoint
7 | python ./run_benchmark.py \
8 | --model \
9 | --deployment_name \
10 | --aml_api_url \
11 | --aml_api_key \
12 | --mean_prompt_length 2600 \
13 | --mean_max_new_tokens 60 \
14 | --num_requests 256 \
15 | --backend aml
16 |
17 | ### Generate the plots
18 | python ./src/plot_th_lat.py
19 |
20 | echo "Find figures in ./plots/ and log outputs in ./results/"
--------------------------------------------------------------------------------
/benchmarks/inference/mii/run_example.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | # Run benchmark
7 | python ./run_benchmark.py \
8 | --model meta-llama/Llama-2-7b-hf \
9 | --tp_size 1 \
10 | --num_replicas 1 \
11 | --max_ragged_batch_size 768 \
12 | --mean_prompt_length 2600 \
13 | --mean_max_new_tokens 60 \
14 | --stream \
15 | --backend fastgen
16 |
17 | ### Generate the plots
18 | python ./src/plot_th_lat.py
19 |
20 | echo "Find figures in ./plots/ and log outputs in ./results/"
--------------------------------------------------------------------------------
/benchmarks/inference/mii/run_fp6.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | MODELS=(NousResearch/Llama-2-70b-hf)
7 |
8 | for MODEL in ${MODELS[@]}; do
9 | python ./run_benchmark.py --model ${MODEL} --num_requests 128 --stream --backend fastgen --fp6 --tp_size 1
10 | done
--------------------------------------------------------------------------------
/benchmarks/inference/mii/src/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
--------------------------------------------------------------------------------
/benchmarks/inference/mii/src/random_query_generator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 |
6 | import numpy as np
7 | import torch
8 | import random
9 |
10 |
11 | class RandomQueryGenerator:
12 | def __init__(self, input_text, tokenizer, seed):
13 | self.input_text = input_text
14 | self.tokenizer = tokenizer
15 |
16 | torch.manual_seed(seed)
17 | random.seed(seed)
18 | np.random.seed(seed)
19 |
20 | def get_random_request_text(self, length, variance, max_length, batch):
21 | request_text = []
22 | tokenized_input = self.tokenizer.batch_encode_plus(
23 | [self.input_text], return_tensors="pt", padding=False
24 | )
25 | offset = list(range(512))
26 | random.shuffle(offset)
27 |
28 | text_ids = tokenized_input["input_ids"][0]
29 | for i in range(batch):
30 | # Sample the prompt length from a normal distribution with mean=length and std=variance, capped at max_length
31 | req_prompt_length = min(int(np.random.normal(length, variance)), max_length)
32 |
33 | text = self.tokenizer.decode(text_ids[i : req_prompt_length + i])
34 | request_text.append(text)
35 | return request_text
36 |
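A usage sketch for `RandomQueryGenerator`, assuming a locally available HuggingFace tokenizer (`gpt2` is just an example) and that the script runs from the benchmark directory. Note that the `variance` argument is passed to `np.random.normal` as the scale, i.e. it acts as a standard deviation:

```python
from transformers import AutoTokenizer

from src.random_query_generator import RandomQueryGenerator  # adjust the import path as needed

tokenizer = AutoTokenizer.from_pretrained("gpt2")
base_text = "DeepSpeed is a deep learning optimization library. " * 200
generator = RandomQueryGenerator(base_text, tokenizer, seed=42)

# Four prompts of roughly 128 tokens each (std 12.8), capped at 512 tokens.
prompts = generator.get_random_request_text(length=128, variance=12.8, max_length=512, batch=4)
for p in prompts:
    print(len(tokenizer.encode(p)))
```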
--------------------------------------------------------------------------------
/benchmarks/inference/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.21.3
2 |
--------------------------------------------------------------------------------
/benchmarks/inference/run_model.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | model=$1
4 | dtype=$2
5 | graphs=$3
6 | kernel=$4
7 | gpus=$5
8 |
9 | version=0
10 | log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
11 | mkdir -p ${log_path}
12 |
13 | params="--dtype $dtype "
14 | if [[ "$graphs" == "true" ]]; then
15 | params+="--graphs "
16 | fi
17 | if [[ "$kernel" == "true" ]]; then
18 | params+="--kernel "
19 | fi
20 |
21 | echo "baseline $log_path"
22 | deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log
23 |
24 | echo "deepspeed $log_path"
25 | deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/deepspeed.log
--------------------------------------------------------------------------------
/compression/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed Model Compression examples
2 |
3 | Examples in this folder are helpful to try out some features and models that take advantage of the DeepSpeed compression library.
4 |
5 | A detailed tutorial for understanding and using DeepSpeed model compression features can be seen from here: https://www.deepspeed.ai/tutorials/model-compression/
6 |
--------------------------------------------------------------------------------
/compression/bert/README.md:
--------------------------------------------------------------------------------
1 | #### Install
2 |
3 | ``pip install -r requirements.txt``
4 |
5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library.
6 |
7 | #### Key File: run_glue_no_trainer.py
8 |
9 | The Python code is based on [HuggingFace's PyTorch text_classification](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification). The key added feature is the implementation of knowledge distillation (KD), enabled with `--distill_method one_stage`. To run without KD, use `--distill_method zero_stage`.
10 |
11 | #### Folders (config, huggingface_transformer, bash_script)
12 |
13 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction.
14 | * **huggingface_transformer:** This folder contains the implementation of knowledge distillation. It's based on [HuggingFace's transformer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py).
15 | The change is at line 383, where we output attention_scores instead of attention_probs.
16 | * **bash_script:** This folder contains many bash scripts for various kinds of compression. See more descriptions and results on our [tutorial page](https://www.deepspeed.ai/).
17 |
18 |
--------------------------------------------------------------------------------
/compression/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | transformers == 4.15.0
3 | datasets >= 1.8.0
4 | sentencepiece != 0.1.92
5 | scipy
6 | scikit-learn
7 | protobuf
8 | gpustat
9 | torch >= 1.3
10 |
--------------------------------------------------------------------------------
/compression/cifar/README.md:
--------------------------------------------------------------------------------
1 | #### Install
2 |
3 | ``pip install torch torchvision``
4 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library.
5 |
6 | #### Key File: train.py
7 |
8 | The Python code is based on the [CIFAR example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar). The key added feature is the compression pipeline.
9 |
10 | #### Folders (config)
11 |
12 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction.
13 |
14 | #### Bash script
15 | * **run_compress.sh:** This bash script contains jobs for training a checkpoint and then compressing it. See more descriptions and results on our [tutorial page](https://www.deepspeed.ai/).
16 |
17 |
--------------------------------------------------------------------------------
/compression/cifar/config/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 32,
3 | "train_micro_batch_size_per_gpu": 32,
4 | "steps_per_print": 50,
5 |
6 | "optimizer": {
7 | "type": "Adam",
8 | "params": {
9 | "lr": 0.001,
10 | "betas": [
11 | 0.8,
12 | 0.999
13 | ],
14 | "eps": 1e-8,
15 | "weight_decay": 3e-7
16 | }
17 | },
18 |
19 | "zero_optimization": {
20 | "stage": 0
21 | },
22 |
23 | "fp16":{
24 | "enabled": true
25 | },
26 |
27 | "gradient_clipping": 1.0,
28 | "prescale_gradients": true,
29 |
30 | "wall_clock_breakdown" : false
31 | }
32 |
33 |
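For reference, a config like this is typically handed to `deepspeed.initialize`, which constructs the optimizer, batch sizes, and fp16 machinery from the JSON. A minimal sketch with a toy model (not the actual CIFAR training code; fp16 assumes a CUDA-capable accelerator):

```python
import torch
import deepspeed

model = torch.nn.Linear(32, 10)  # toy stand-in; train.py uses a CIFAR CNN

# No torch optimizer is constructed here: the optimizer type, learning rate,
# and fp16 settings all come from the JSON config above.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="config/ds_config.json",
)
```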
--------------------------------------------------------------------------------
/compression/gpt2/README.md:
--------------------------------------------------------------------------------
1 | #### Install
2 |
3 | ``pip install -r requirements.txt``
4 |
5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library.
6 |
7 |
8 | #### Key File: run_clm_no_trainer.py
9 |
10 | The Python code is based on [HuggingFace's run_clm_no_trainer.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py). The key added feature is the compression pipeline.
11 |
12 | #### Folders (config)
13 |
14 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction.
15 |
16 | #### Bash script
17 | * **run_zero_quant.sh:** This bash script contains jobs for training a checkpoint and then compressing it. Run the job from the gpt2 directory:
18 |
19 | ```DeepSpeedExamples/compression/gpt2$ . ./bash_script/run_zero_quant.sh```
20 | See more descriptions and results on our [tutorial page](https://www.deepspeed.ai/).
21 |
22 |
--------------------------------------------------------------------------------
/compression/gpt2/config/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 8,
3 | "train_micro_batch_size_per_gpu": 4,
4 | "steps_per_print": 50,
5 |
6 | "optimizer": {
7 | "type": "Adam",
8 | "params": {
9 | "lr": 0.001,
10 | "betas": [
11 | 0.8,
12 | 0.999
13 | ],
14 | "eps": 1e-8,
15 | "weight_decay": 3e-7
16 | }
17 | },
18 |
19 | "zero_optimization": {
20 | "stage": 0
21 | },
22 |
23 | "fp16":{
24 | "enabled": true
25 | },
26 |
27 | "gradient_clipping": 1.0,
28 | "prescale_gradients": true,
29 |
30 | "wall_clock_breakdown" : false
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/compression/gpt2/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.8.0
2 | sentencepiece != 0.1.92
3 | protobuf
4 | transformers == 4.15.0
5 | accelerate
--------------------------------------------------------------------------------
/deepnvme/file_access/aio_load_gpu_tensor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os, timeit, functools
3 | from deepspeed.ops.op_builder import AsyncIOBuilder
4 | from utils import parse_read_arguments, GIGA_UNIT
5 |
6 | def file_read(inp_f, handle, bounce_buffer):
7 | handle.sync_pread(bounce_buffer, inp_f)  # blocking read of the file into the pinned host buffer
8 | return bounce_buffer.cuda()  # copy from pinned host memory to the GPU
9 |
10 |
11 | def main():
12 | args = parse_read_arguments()
13 | input_file = args.input_file
14 | file_sz = os.path.getsize(input_file)
15 | cnt = args.loop
16 |
17 | aio_handle = AsyncIOBuilder().load().aio_handle()
18 | bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory()  # pinned host buffer for the aio read and fast GPU copies
19 |
20 | t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer))
21 | aio_t = t.timeit(cnt)
22 | aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t
23 | print(f'aio load_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec')
24 |
25 | if args.validate:
26 | from py_load_cpu_tensor import file_read as py_file_read
27 | aio_tensor = file_read(input_file, aio_handle, bounce_buffer).cpu()
28 | py_tensor = py_file_read(input_file)
29 | print(f'Validation success = {aio_tensor.equal(py_tensor)}')
30 |
31 | if __name__ == "__main__":
32 | main()
33 |
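The write direction uses the same bounce-buffer pattern in reverse. A sketch of the store counterpart, assuming the aio handle exposes `sync_pwrite` symmetric to the `sync_pread` used above; the repo's `aio_store_*_tensor.py` scripts are the authoritative versions:

```python
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

def file_write(out_f, handle, gpu_tensor, bounce_buffer):
    bounce_buffer.copy_(gpu_tensor)  # stage GPU data into pinned host memory
    handle.sync_pwrite(bounce_buffer, out_f)  # blocking write from the pinned buffer

aio_handle = AsyncIOBuilder().load().aio_handle()
gpu_tensor = torch.empty(1024 ** 2, dtype=torch.uint8, device='cuda')
bounce_buffer = torch.empty_like(gpu_tensor, device='cpu').pin_memory()
file_write('/tmp/test_output.pt', aio_handle, gpu_tensor, bounce_buffer)
```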
--------------------------------------------------------------------------------
/deepnvme/file_access/media/deepnvme_ops_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/file_access/media/deepnvme_ops_report.png
--------------------------------------------------------------------------------
/deepnvme/file_access/py_load_cpu_tensor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os, timeit, functools
3 | from utils import parse_read_arguments, GIGA_UNIT
4 |
5 | def file_read(inp_f):
6 | with open(inp_f, 'rb') as f:
7 | tensor = torch.frombuffer(f.read(), dtype=torch.uint8)
8 | return tensor
9 |
10 | def main():
11 | args = parse_read_arguments()
12 | input_file = args.input_file
13 | file_sz = os.path.getsize(input_file)
14 | cnt = args.loop
15 |
16 | t = timeit.Timer(functools.partial(file_read, input_file))
17 | py_t = t.timeit(cnt)
18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t
19 | print(f'py load_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec')
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
/deepnvme/file_access/py_load_gpu_tensor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os, timeit, functools
3 | from utils import parse_read_arguments, GIGA_UNIT
4 |
5 | def file_read(inp_f):
6 | with open(inp_f, 'rb') as f:
7 | tensor = torch.frombuffer(f.read(), dtype=torch.uint8)
8 | return tensor.cuda()
9 |
10 | def main():
11 | args = parse_read_arguments()
12 | input_file = args.input_file
13 | file_sz = os.path.getsize(input_file)
14 | cnt = args.loop
15 |
16 | t = timeit.Timer(functools.partial(file_read, input_file))
17 | py_t = t.timeit(cnt)
18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t
19 | print(f'py load_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec')
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
/deepnvme/file_access/py_store_cpu_tensor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os, timeit, functools
3 | import pathlib
4 | from utils import parse_write_arguments, GIGA_UNIT
5 |
6 | def file_write(out_f, tensor):
7 | with open(out_f, 'wb') as f:
8 | f.write(tensor.numpy(force=True))
9 |
10 | def main():
11 | args = parse_write_arguments()
12 | cnt = args.loop
13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt')
14 | pathlib.Path(output_file).unlink(missing_ok=True)
15 | file_sz = args.mb_size*(1024**2)
16 | cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False)
17 |
18 | t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor))
19 |
20 | py_t = t.timeit(cnt)
21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t
22 | print(f'py store_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec')
23 | pathlib.Path(output_file).unlink(missing_ok=True)
24 |
25 | if __name__ == "__main__":
26 | main()
27 |
--------------------------------------------------------------------------------
/deepnvme/file_access/py_store_gpu_tensor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os, timeit, functools
3 | import pathlib
4 | from utils import parse_write_arguments, GIGA_UNIT
5 |
6 | def file_write(out_f, tensor):
7 | with open(out_f, 'wb') as f:
8 | f.write(tensor.numpy(force=True))  # force=True copies the GPU tensor to host memory before conversion
9 |
10 | def main():
11 | args = parse_write_arguments()
12 | cnt = args.loop
13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt')
14 | pathlib.Path(output_file).unlink(missing_ok=True)
15 | file_sz = args.mb_size*(1024**2)
16 | gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False)
17 |
18 | t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor))
19 |
20 | py_t = t.timeit(cnt)
21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t
22 | print(f'py store_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec')
23 | pathlib.Path(output_file).unlink(missing_ok=True)
24 |
25 |
26 | if __name__ == "__main__":
27 | main()
28 |
--------------------------------------------------------------------------------
/deepnvme/file_access/run_load_tensor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [[ $# -ne 1 ]]; then
4 | echo "Usage: $0 "
5 | exit 1
6 | fi
7 |
8 | input_file=$1
9 | if ! [[ -f "$input_file" ]]; then
10 | echo "Error: $input_file does not exist"
11 | exit 1
12 | fi
13 |
14 |
15 | echo "Running load tensor examples using $input_file"
16 | for f in aio_load_cpu_tensor.py aio_load_gpu_tensor.py \
17 | gds_load_gpu_tensor.py \
18 | py_load_cpu_tensor.py py_load_gpu_tensor.py; do
19 | cmd="python $f --input_file $input_file"
20 | sync
21 | echo $cmd
22 | eval $cmd
23 | sleep 2
24 | done
25 |
26 |
27 |
--------------------------------------------------------------------------------
/deepnvme/file_access/run_store_tensor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [[ $# -ne 1 ]]; then
4 | echo "Usage: $0 "
5 | exit 1
6 | fi
7 |
8 | output_folder=$1
9 | if ! [[ -d "$output_folder" ]]; then
10 | echo "Error: $output_folder does not exist"
11 | exit 1
12 | fi
13 |
14 |
15 | echo "Running store tensor examples using $output_folder"
16 | for f in aio_store_cpu_tensor.py aio_store_gpu_tensor.py \
17 | gds_store_gpu_tensor.py \
18 | py_store_cpu_tensor.py py_store_gpu_tensor.py; do
19 | cmd="python $f --nvme_folder $output_folder"
20 | sync
21 | echo $cmd
22 | eval $cmd
23 | sleep 2
24 | done
25 |
26 |
27 |
--------------------------------------------------------------------------------
/deepnvme/model_checkpoint/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 |
--------------------------------------------------------------------------------
/deepnvme/zero_inference/media/nvme_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/nvme_config.png
--------------------------------------------------------------------------------
/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png
--------------------------------------------------------------------------------
/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png
--------------------------------------------------------------------------------
/inference/huggingface/automatic-speech-recognition/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Huggingface Automatic Speech Recognition Examples
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | For the `test-wav2vec2.py` speech model example, you may also need to install the `libsndfile1-dev` system library:
11 |
12 | sudo apt-get install libsndfile1-dev
13 |
14 |
15 | # Usage
16 | Examples can be run as follows:
17 | deepspeed --num_gpus [number of GPUs] test-[model].py
18 |
19 | # Example Output
20 | Command:
21 |
22 | deepspeed --num_gpus 1 test-wav2vec2.py
23 |
24 |
25 | Output:
26 |
27 | WER: 0.03383673158855752
28 |
29 |
--------------------------------------------------------------------------------
/inference/huggingface/automatic-speech-recognition/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.21.2
4 | soundfile
5 | jiwer
6 | datasets
7 |
--------------------------------------------------------------------------------
/inference/huggingface/fill-mask/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Huggingface Fill Mask Examples
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | # Usage
11 | Examples can be run as follows:
12 | deepspeed --num_gpus [number of GPUs] test-[model].py
13 |
14 | # Example Output
15 | Command:
16 |
17 | deepspeed --num_gpus 1 test-roberta.py
18 |
19 |
20 | Output:
21 |
22 | [{'score': 0.40290409326553345, 'token': 3742, 'token_str': ' Internet', 'sequence': 'The invention of the Internet revolutionized the way we communicate with each other.'}, {'score': 0.20314466953277588, 'token': 7377, 'token_str': ' telephone', 'sequence': 'The invention of the telephone revolutionized the way we communicate with each other.'}, {'score': 0.17653286457061768, 'token': 2888, 'token_str': ' internet', 'sequence': 'The invention of the internet revolutionized the way we communicate with each other.'}, {'score': 0.06900821626186371, 'token': 4368, 'token_str': ' smartphone', 'sequence': 'The invention of the smartphone revolutionized the way we communicate with each other.'}, {'score': 0.03270129859447479, 'token': 3034, 'token_str': ' computer', 'sequence': 'The invention of the computer revolutionized the way we communicate with each other.'}]
23 |
24 |
--------------------------------------------------------------------------------
/inference/huggingface/fill-mask/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.21.2
4 |
--------------------------------------------------------------------------------
/inference/huggingface/fill-mask/test-electra.py:
--------------------------------------------------------------------------------
1 | from transformers import pipeline
2 | import transformers
3 | import deepspeed
4 | import torch
5 | import os
6 | from transformers.models.electra.modeling_electra import ElectraLayer
7 | from deepspeed.accelerator import get_accelerator
8 |
9 | local_rank = int(os.getenv('LOCAL_RANK', '0'))
10 | world_size = int(os.getenv('WORLD_SIZE', '4'))
11 |
12 | pipe = pipeline('fill-mask', model="google/electra-base-generator",
13 | tokenizer="google/electra-base-generator")
14 |
15 | # The injection_policy shows two things:
16 | # 1. which layer module we need to add Tensor-Parallelism
17 | # 2. the name of one or several linear layers: a) attention_output (both encoder and decoder),
18 | # and b) transformer output
19 | pipe.model = deepspeed.init_inference(
20 | pipe.model,
21 | mp_size=world_size,
22 | dtype=torch.float,
23 | injection_policy={ElectraLayer: ('output.dense', )}  # one-element tuple, not a plain string
24 | )
25 | pipe.device = torch.device(get_accelerator().device_name(local_rank))
26 | output = pipe(f"HuggingFace is creating a {pipe.tokenizer.mask_token} that the community uses to solve NLP tasks.")
27 |
28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
29 | print(output)
30 |
--------------------------------------------------------------------------------
/inference/huggingface/fill-mask/test-roberta.py:
--------------------------------------------------------------------------------
1 | from transformers import pipeline
2 | import transformers
3 | import deepspeed
4 | import torch
5 | import os
6 | from transformers.models.roberta.modeling_roberta import RobertaLayer
7 | from deepspeed.accelerator import get_accelerator
8 |
9 | local_rank = int(os.getenv('LOCAL_RANK', '0'))
10 | world_size = int(os.getenv('WORLD_SIZE', '4'))
11 |
12 | pipe = pipeline('fill-mask', model="roberta-large", device=local_rank)
13 |
14 | # The injection_policy shows two things:
15 | # 1. which layer module we need to add Tensor-Parallelism
16 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder),
17 | # and b) transformer output
18 |
19 | pipe.model = deepspeed.init_inference(
20 | pipe.model,
21 | mp_size=world_size,
22 | dtype=torch.float,
23 | injection_policy={RobertaLayer: ('output.dense', )}  # one-element tuple, not a plain string
24 | )
25 |
26 | pipe.device = torch.device(get_accelerator().device_name(local_rank))
27 | output = pipe("The invention of the revolutionized the way we communicate with each other.")
28 |
29 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
30 | print(output)
31 |
--------------------------------------------------------------------------------
/inference/huggingface/stable-diffusion/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Stable Diffusion Example
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | # Usage
11 | Examples can be run as follows:
12 | deepspeed --num_gpus [number of GPUs] test-[model].py
13 |
14 | NOTE: Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`.
15 |
16 | # Example Output
17 | Command:
18 |
19 | deepspeed --num_gpus 1 test-stable-diffusion.py
20 |
21 |
22 | Output:
23 |
24 | ./baseline.png
25 | ./deepspeed.png
26 |
27 |
--------------------------------------------------------------------------------
/inference/huggingface/stable-diffusion/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | diffusers>=0.22.3
4 | triton==2.0.0.dev20221202
5 |
--------------------------------------------------------------------------------
/inference/huggingface/text-generation/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.28.1
4 |
--------------------------------------------------------------------------------
/inference/huggingface/text-generation/run-generation-script/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Huggingface Text Generation Script
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | # Usage
11 | The [`test-run-generation.py`](./test-run-generation.py) example can be run with the [test-gpt.sh](./test-gpt.sh) script:
12 |
13 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
14 | --model_type=gpt2 \
15 | --model_name_or_path=gpt2-xl \
16 | --sample_input single_query.txt \
17 | --fp16 \
18 | --ds-inference
19 |
20 |
21 | # Example Output
22 | Command:
23 |
24 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
25 | --model_type=gpt2 \
26 | --model_name_or_path=gpt2-xl \
27 | --sample_input single_query.txt \
28 | --fp16 \
29 | --ds-inference
30 |
31 |
32 | Output:
33 |
34 | === GENERATED SEQUENCE 1 ===
35 | What is DeepSpeed?
36 |
37 | DeepSpeed is a multi-dimensional data compression framework designed to achieve high compression ratio on human readable
38 |
39 |
--------------------------------------------------------------------------------
/inference/huggingface/text-generation/run-generation-script/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.21.2
4 | numpy
5 | sentencepiece
6 | protobuf
7 |
--------------------------------------------------------------------------------
/inference/huggingface/text-generation/run-generation-script/single_query.txt:
--------------------------------------------------------------------------------
1 | What is DeepSpeed?
2 |
--------------------------------------------------------------------------------
/inference/huggingface/text-generation/run-generation-script/test-gpt.sh:
--------------------------------------------------------------------------------
1 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
2 | --model_type=gpt2 \
3 | --model_name_or_path=gpt2-xl \
4 | --sample_input single_query.txt \
5 | --fp16 \
6 | --ds-inference
7 |
--------------------------------------------------------------------------------
/inference/huggingface/text2text-generation/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Huggingface Text2Text Generation Examples
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | # Usage
11 | Examples can be run as follows:
12 | deepspeed --num_gpus [number of GPUs] test-[model].py
13 |
14 | # Example Output
15 | Command:
16 |
17 | deepspeed --num_gpus 1 test-t5.py
18 |
19 |
20 | Output:
21 |
22 | [{'generated_text': 'd review: this is the best cast iron skillet. Great review! Great review! Great'}]
23 |
24 |
--------------------------------------------------------------------------------
/inference/huggingface/text2text-generation/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.21.2
4 | sentencepiece
5 | google
6 | protobuf
7 |
--------------------------------------------------------------------------------
/inference/huggingface/text2text-generation/test-t5.py:
--------------------------------------------------------------------------------
1 | from transformers import pipeline
2 | import transformers
3 | import deepspeed
4 | import torch
5 | import os
6 | from transformers.models.t5.modeling_t5 import T5Block
7 |
8 | local_rank = int(os.getenv('LOCAL_RANK', '0'))
9 | world_size = int(os.getenv('WORLD_SIZE', '4'))
10 |
11 | pipe = pipeline("text2text-generation", model="google/t5-v1_1-small", device=local_rank)
12 |
13 | # The injection_policy shows two things:
14 | # 1. which layer module we need to add Tensor-Parallelism
15 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder),
16 | # and b) transformer output
17 |
18 | pipe.model = deepspeed.init_inference(
19 | pipe.model,
20 | mp_size=world_size,
21 | dtype=torch.float,
22 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')}
23 | )
24 |
25 | pipe.device = torch.device(f'cuda:{local_rank}')
26 | output = pipe("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy")
27 |
28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
29 | print(output)
30 |
--------------------------------------------------------------------------------
/inference/huggingface/translation/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DeepSpeed Huggingface Translation Examples
3 |
4 | # Setup
5 | Python dependencies:
6 |
7 | pip install -r requirements.txt
8 |
9 |
10 | # Usage
11 | Examples can be run as follows:
12 | deepspeed --num_gpus [number of GPUs] test-[model].py
13 |
14 | # Example Output
15 | Command:
16 |
17 | deepspeed --num_gpus 1 test-t5-base.py
18 |
19 |
20 | Output:
21 |
22 | [{'translation_text': 'Le renard brun rapide saute au-dessus du chien lazy.'}]
23 |
24 |
--------------------------------------------------------------------------------
/inference/huggingface/translation/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | torch
3 | transformers==4.21.2
4 | sentencepiece
5 | google
6 | protobuf
7 |
--------------------------------------------------------------------------------
/inference/huggingface/translation/test-t5-base.py:
--------------------------------------------------------------------------------
1 | import deepspeed
2 | import torch
3 | import os
4 | from transformers import pipeline
5 | from transformers.models.t5.modeling_t5 import T5Block
6 |
7 | local_rank = int(os.getenv('LOCAL_RANK', '0'))
8 | world_size = int(os.getenv('WORLD_SIZE', '4'))
9 |
10 | # Init translator
11 | translator = pipeline("translation_en_to_fr", model="t5-base", tokenizer="t5-base", device=local_rank)
12 |
13 | # DeepSpeed init_inference API
14 | translator.model = deepspeed.init_inference(translator.model,
15 | mp_size=world_size,
16 | dtype=torch.float,
17 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')}
18 | )
19 |
20 | # Translate text
21 | text = "The quick brown fox jumps over the lazy dog."
22 | translation = translator(text)
23 |
24 | # Print translation
25 | print(translation)
26 |
--------------------------------------------------------------------------------
/inference/huggingface/zero_inference/images/over_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/inference/huggingface/zero_inference/images/over_v1.png
--------------------------------------------------------------------------------
/inference/huggingface/zero_inference/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed>=0.10.1
2 | torch
3 | transformers @ git+https://github.com/tjruwase/transformers@kvcache-offload-cpu
4 | packaging
5 | accelerate
6 |
--------------------------------------------------------------------------------
/inference/huggingface/zero_inference/run_llama2_70b_a6000.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | export USE_TF=0
3 | BASE_LOG_DIR=~/experiments/zero_inference/
4 | MODEL_NAME="Llama-2-70b-hf"
5 | FULL_MODEL_NAME="meta-llama/${MODEL_NAME}"
6 | QB=4
7 |
8 | BSZ=64
9 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ}
10 | mkdir -p $LOG_DIR
11 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt
12 |
13 | BSZ=96
14 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ}
15 | mkdir -p $LOG_DIR
16 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt
17 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt
18 |
19 |
20 | BSZ=200
21 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ}
22 | mkdir -p $LOG_DIR
23 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv_q${QB}.txt
24 |
--------------------------------------------------------------------------------
/inference/mii/README.md:
--------------------------------------------------------------------------------
1 | # DeepSpeed MII Examples
2 |
3 | Install the requirements by running `pip install -r requirements.txt`.
4 |
5 | Once [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. See the scripts in [non-persistent](./non-persistent/) and [persistent](./persistent/) for examples. Details on the code implemented in these scripts can be found on our [Getting Started guide for MII](https://github.com/deepspeedai/DeepSpeed-mii#getting-started-with-mii).
6 |
--------------------------------------------------------------------------------
/inference/mii/non-persistent/README.md:
--------------------------------------------------------------------------------
1 | # Non-Persistent Pipeline Examples
2 |
3 | The `pipeline.py` script can be used to run any of the [supported
4 | models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models). Provide
5 | the HuggingFace model name, maximum generated tokens, and prompt(s). The
6 | generated responses will be printed in the terminal:
7 |
8 | ```shell
9 | $ python pipeline.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is"
10 | ```
11 |
12 | Tensor-parallelism can be controlled using the `deepspeed` launcher and setting
13 | `--num_gpus`:
14 |
15 | ```shell
16 | $ deepspeed --num_gpus 2 pipeline.py
17 | ```
18 |
19 | ## Model-Specific Examples
20 |
21 | For convenience, we also provide a set of scripts to quickly test the MII
22 | Pipeline with some popular text-generation models:
23 |
24 | | Model | Launch command |
25 | |-------|----------------|
26 | | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | `$ python llama2.py` |
27 | | [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) | `$ python falcon.py` |
28 | | [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | `$ deepspeed --num_gpus 2 mixtral.py` |
--------------------------------------------------------------------------------
/inference/mii/non-persistent/falcon.py:
--------------------------------------------------------------------------------
1 | import mii
2 |
3 | pipe = mii.pipeline("tiiuae/falcon-7b")
4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
5 | if pipe.is_rank_0:
6 | print(responses[0])
7 |
--------------------------------------------------------------------------------
/inference/mii/non-persistent/llama2.py:
--------------------------------------------------------------------------------
1 | import mii
2 |
3 | pipe = mii.pipeline("meta-llama/Llama-2-7b-hf")
4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
5 | if pipe.is_rank_0:
6 | print(responses[0])
7 |
--------------------------------------------------------------------------------
/inference/mii/non-persistent/mixtral.py:
--------------------------------------------------------------------------------
1 | import mii
2 |
3 | pipe = mii.pipeline("mistralai/Mixtral-8x7B-v0.1")
4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
5 | if pipe.is_rank_0:
6 | print(responses[0])
7 |
--------------------------------------------------------------------------------
/inference/mii/non-persistent/pipeline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mii
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
6 | parser.add_argument(
7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"]
8 | )
9 | parser.add_argument("--max-new-tokens", type=int, default=128)
10 | args = parser.parse_args()
11 |
12 | pipe = mii.pipeline(args.model)
13 | responses = pipe(
14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True
15 | )
16 |
17 | if pipe.is_rank_0:
18 | for r in responses:
19 | print(r, "\n", "-" * 80, "\n")
20 |
--------------------------------------------------------------------------------
/inference/mii/persistent/README.md:
--------------------------------------------------------------------------------
1 | # Persistent Deployment Examples
2 |
3 | The `serve.py` script can be used to create an inference server for any of the
4 | [supported models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models).
5 | Provide the HuggingFace model name and tensor-parallelism (use the default
6 | values and run `$ python serve.py` for a single-GPU
7 | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
8 | deployment):
9 |
10 | ```shell
11 | $ python serve.py --model "mistralai/Mistral-7B-v0.1" --tensor-parallel 1
12 | ```
13 |
14 | Connect to the persistent deployment and generate text with `client.py`. Provide
15 | the HuggingFace model name, maximum generated tokens, and prompt(s) (or if you
16 | are using the default values, run `$ python client.py`):
17 |
18 | ```shell
19 | $ python client.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is"
20 | ```
21 |
22 | Shut down the persistent deployment with `terminate.py`. Provide the HuggingFace
23 | model name (or if you are using the default values, run `$ python
24 | terminate.py`):
25 |
26 | ```shell
27 | $ python terminate.py --model "mistralai/Mistral-7B-v0.1"
28 | ```
--------------------------------------------------------------------------------
/inference/mii/persistent/client.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mii
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
6 | parser.add_argument(
7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"]
8 | )
9 | parser.add_argument("--max-new-tokens", type=int, default=128)
10 | args = parser.parse_args()
11 |
12 | client = mii.client(args.model)
13 | responses = client(
14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True
15 | )
16 |
17 | for r in responses:
18 | print(r, "\n", "-" * 80, "\n")
19 |
--------------------------------------------------------------------------------
/inference/mii/persistent/serve.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mii
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
6 | parser.add_argument("--tensor-parallel", type=int, default=1)
7 | args = parser.parse_args()
8 |
9 | mii.serve(args.model, tensor_parallel=args.tensor_parallel)
10 |
11 | print(f"Serving model {args.model} on {args.tensor_parallel} GPU(s).")
12 | print(f"Run `python client.py --model {args.model}` to connect.")
13 | print(f"Run `python terminate.py --model {args.model}` to terminate.")
14 |
--------------------------------------------------------------------------------
/inference/mii/persistent/terminate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mii
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
6 | args = parser.parse_args()
7 |
8 | client = mii.client(args.model)
9 | client.terminate_server()
10 |
11 | print(f"Terminated server for model {args.model}.")
12 |
--------------------------------------------------------------------------------
/inference/mii/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed-mii>=0.1.3
2 |
--------------------------------------------------------------------------------
/inference/sglang/README.md:
--------------------------------------------------------------------------------
1 | # SGLang + ZeRO-Inference Examples
2 | This folder contains examples of [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) integration into the [SGLang](https://github.com/sgl-project/sglang) framework. This integration enables SGLang to run inference on massive models (e.g., with hundreds of billions of parameters) on a single GPU through the NVMe/CPU offloading optimizations of ZeRO-Inference.
3 |
4 | ## Prerequisites
5 | 1. DeepSpeed version >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6)
6 | 2. SGLang: These examples require our SGLang [fork](https://github.com/tjruwase/sglang/tree/zero-inference). We plan to upstream the SGLang changes to the main branch.
7 |
8 |
9 | ## Examples
10 | The examples comprise the following:
11 | 1. bash scripts that benchmark SGLang throughput in [offline mode](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) with different ZeRO-Inference offloading options. Each script runs inference on a different model with a prompt of 512 tokens, an output of 32 tokens, and a batch size of 128.
12 | 2. DeepSpeed config files corresponding to ZeRO-Inference offloading: (i) CPU offload, (ii) NVMe offload with AIO, and (iii) NVMe offload with NVIDIA GDS (see the inspection sketch below).
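As a quick sanity check before launching a benchmark, the offload mode of each config can be read straight from the JSON. A small inspection sketch (file names as in this folder):

```python
import json

for name in ("ds_offload_cpu.json", "ds_offload_nvme_aio.json", "ds_offload_nvme_gds.json"):
    with open(name) as f:
        cfg = json.load(f)
    offload = cfg["zero_optimization"]["offload_param"]
    gds = cfg.get("aio", {}).get("use_gds", False)
    print(f"{name}: device={offload['device']}, use_gds={gds}")
```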
--------------------------------------------------------------------------------
/inference/sglang/ds_offload_cpu.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 3,
4 | "stage3_prefetch_bucket_size": "auto",
5 | "stage3_param_persistence_threshold": "auto",
6 | "stage3_max_live_parameters": "auto",
7 | "offload_param": {
8 | "device": "cpu",
9 | "pin_memory": true,
10 | "buffer_size": "auto"
11 | }
12 | },
13 | "train_batch_size": 1
14 | }
15 |
--------------------------------------------------------------------------------
/inference/sglang/ds_offload_nvme_aio.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 3,
4 | "stage3_prefetch_bucket_size": "auto",
5 | "stage3_param_persistence_threshold": "auto",
6 | "stage3_max_live_parameters": "auto",
7 | "offload_param": {
8 | "device": "nvme",
9 | "nvme_path": "/local_nvme/sglang",
10 | "pin_memory": true,
11 | "buffer_size": "auto",
12 | "buffer_count": 5
13 | }
14 | },
15 | "aio": {
16 | "block_size": 8388608,
17 | "queue_depth": 32,
18 | "intra_op_parallelism": 8,
19 | "single_submit": false,
20 | "overlap_events": true,
21 | "use_gds": false
22 | },
23 | "train_batch_size": 1
24 | }
25 |
--------------------------------------------------------------------------------
/inference/sglang/ds_offload_nvme_gds.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 3,
4 | "stage3_prefetch_bucket_size": "auto",
5 | "stage3_param_persistence_threshold": "auto",
6 | "stage3_max_live_parameters": "auto",
7 | "offload_param": {
8 | "device": "nvme",
9 | "nvme_path": "/local_nvme/sglang",
10 | "pin_memory": true,
11 | "buffer_size": "auto",
12 | "buffer_count": 3
13 | }
14 | },
15 | "aio": {
16 | "block_size": 8388608,
17 | "queue_depth": 32,
18 | "intra_op_parallelism": 8,
19 | "single_submit": false,
20 | "overlap_events": true,
21 | "use_gds": true
22 | },
23 | "train_batch_size": 1
24 | }
25 |
--------------------------------------------------------------------------------
/inference/sglang/run_llama3_1B.sh:
--------------------------------------------------------------------------------
1 | export LOCAL_RANK=0
2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0"
3 | BATCH_SIZE=128
4 | MODEL_NAME="meta-llama/Llama-3.2-1B"
5 |
6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json
7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json
8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json
9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph
10 |
11 |
12 |
--------------------------------------------------------------------------------
/inference/sglang/run_llama3_70B.sh:
--------------------------------------------------------------------------------
1 | export LOCAL_RANK=0
2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0"
3 | BATCH_SIZE=128
4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-70B"
5 |
6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json
7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json
8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json
9 | # python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph
10 |
--------------------------------------------------------------------------------
/inference/sglang/run_llama3_8B.sh:
--------------------------------------------------------------------------------
1 | export LOCAL_RANK=0
2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0"
3 | BATCH_SIZE=128
4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct"
5 |
6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json
7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json
8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json
9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph
10 |
--------------------------------------------------------------------------------
/scripts/check-license.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 |
7 | from __future__ import annotations
8 | """
9 | Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py
10 | """
11 |
12 | import subprocess
13 | import sys
14 |
15 |
16 | def err(s: str) -> None:
17 | print(s, file=sys.stderr)
18 |
19 |
20 | COPYRIGHT = [
21 | r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$",
22 | r"^\(\/\/\|#\) DeepSpeed Team$"
23 | ]
24 |
25 | success = True
26 | failures = []
27 | for f in sys.argv[1:]:
28 | for copyright_line in COPYRIGHT:
29 | if not success:
30 | continue
31 | res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True)
32 | if res.returncode == 1:
33 | success = False
34 | failures.append(f)
35 | elif res.returncode == 2:
36 | err(f"Error invoking grep on {', '.join(sys.argv[1:])}:")
37 | err(res.stderr.decode("utf-8"))
38 | sys.exit(2)
39 |
40 | if not success:
41 | err(f'{failures}: Missing license at top of file')
42 | err(res.stdout.decode("utf-8"))
43 | sys.exit(1)
44 |
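A note on the escaped patterns above: `git grep` defaults to POSIX basic regular expressions, where `\(`...`\)` group and `\|` alternates, so `^\(\/\/\|#\)` matches a line beginning with either `//` or `#`. The same checks in Python's `re` syntax, for illustration only (not part of the hook):

```python
import re

COPYRIGHT_RE = [
    re.compile(r"^(//|#) Copyright \(c\) Microsoft Corporation\.$"),
    re.compile(r"^(//|#) SPDX-License-Identifier: Apache-2\.0$"),
    re.compile(r"^(//|#) DeepSpeed Team$"),
]

def has_license(path: str) -> bool:
    """Return True if every required license line appears somewhere in the file."""
    with open(path, encoding="utf-8", errors="ignore") as f:
        lines = f.read().splitlines()
    return all(any(p.match(line) for line in lines) for p in COPYRIGHT_RE)
```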
--------------------------------------------------------------------------------
/training/BingBertGlue/glue_bert_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32,
3 | "train_micro_batch_size_per_gpu": 1,
4 | "steps_per_print": 10,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 2e-5,
9 | "weight_decay": 0.0,
10 | "bias_correction": true
11 | }
12 | },
13 | "gradient_clipping": 1.0,
14 | "fp16": {
15 | "enabled": false
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/training/BingBertGlue/glue_bert_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32,
3 | "train_micro_batch_size_per_gpu": 1,
4 | "steps_per_print": 10,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 2e-5,
9 | "weight_decay": 0.0,
10 | "bias_correction": true
11 | }
12 | },
13 | "gradient_clipping": 1.0,
14 | "fp16": {
15 | "enabled": false
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/training/BingBertGlue/pytorch_pretrained_bert/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.4.0"
2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
3 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
4 | BertForMaskedLM, BertForNextSentencePrediction,
5 | BertForSequenceClassification, BertForMultipleChoice,
6 | BertForTokenClassification, BertForQuestionAnswering)
7 | from .optimization import BertAdam
8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
9 |
--------------------------------------------------------------------------------
/training/BingBertGlue/pytorch_pretrained_bert/__main__.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | def main():
3 | import sys
4 | try:
5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
6 | except ModuleNotFoundError:
7 | print(
8 | "pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
9 | "In that case, it requires TensorFlow to be installed. Please see "
10 | "https://www.tensorflow.org/install/ for installation instructions."
11 | )
12 | raise
13 |
14 | if len(sys.argv) != 5:
15 | # pylint: disable=line-too-long
16 | print(
17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`"
18 | )
19 | else:
20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop()
21 | TF_CONFIG = sys.argv.pop()
22 | TF_CHECKPOINT = sys.argv.pop()
23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG,
24 | PYTORCH_DUMP_OUTPUT)
25 |
26 |
27 | if __name__ == '__main__':
28 | main()
29 |
--------------------------------------------------------------------------------
/training/BingBertGlue/turing/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch.distributed as dist
3 |
4 | logging.basicConfig(
5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
6 | datefmt='%m/%d/%Y %H:%M:%S',
7 | level=logging.INFO)
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | class Logger():
12 | def __init__(self, cuda=False):
13 | self.logger = logging.getLogger(__name__)
14 | self.cuda = cuda
15 |
16 | def info(self, message, *args, **kwargs):
17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda:
18 | self.logger.info(message, *args, **kwargs)
19 |
20 | def error(self, message, *args, **kwargs):
21 | self.logger.error(message, *args, **kwargs)
22 |
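Note: with `cuda=True`, `info()` logs only on rank 0, which keeps multi-GPU logs readable; this assumes `torch.distributed` has been initialized before the first call, since `dist.get_rank()` raises otherwise.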
--------------------------------------------------------------------------------
/training/BingBertGlue/turing/text.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | PAD = 0
4 |
5 |
6 | def mask(x):
7 | return x != PAD
8 |
9 |
10 | def torch_long(x):
11 | return torch.LongTensor(x)
12 |
--------------------------------------------------------------------------------
/training/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 96,
3 | "train_micro_batch_size_per_gpu": 3,
4 | "steps_per_print": 100,
5 | "optimizer": {
6 | "type": "OnebitAdam",
7 | "params": {
8 | "lr": 3e-5,
9 | "freeze_step": 400,
10 | "weight_decay": 0.0,
11 | "bias_correction": false,
12 | "cuda_aware": false,
13 | "comm_backend_name": "mpi"
14 | }
15 | },
16 | "gradient_clipping": 1.0,
17 | "fp16": {
18 | "enabled": true
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/training/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 96,
3 | "train_micro_batch_size_per_gpu": 3,
4 | "steps_per_print": 100,
5 | "optimizer": {
6 | "type": "OnebitAdam",
7 | "params": {
8 | "lr": 3e-5,
9 | "freeze_step": 400,
10 | "weight_decay": 0.0,
11 | "bias_correction": false,
12 | "cuda_aware": true,
13 | "comm_backend_name": "mpi"
14 | }
15 | },
16 | "gradient_clipping": 1.0,
17 | "fp16": {
18 | "enabled": true
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/training/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 96,
3 | "train_micro_batch_size_per_gpu": 3,
4 | "steps_per_print": 100,
5 | "optimizer": {
6 | "type": "OnebitAdam",
7 | "params": {
8 | "lr": 3e-5,
9 | "freeze_step": 400,
10 | "weight_decay": 0.0,
11 | "bias_correction": false,
12 | "cuda_aware": false,
13 | "comm_backend_name": "nccl"
14 | }
15 | },
16 | "gradient_clipping": 1.0,
17 | "fp16": {
18 | "enabled": true
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/training/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 1024,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 4096,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 16,
15 | "num_hidden_layers": 24,
16 | "pad_token_id": 0,
17 | "type_vocab_size": 2,
18 | "vocab_size": 30522
19 | }
20 |
--------------------------------------------------------------------------------
/training/BingBertSquad/deepspeed_bsz24_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 24,
3 | "train_micro_batch_size_per_gpu": 3,
4 | "steps_per_print": 10,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 3e-5,
9 | "weight_decay": 0.0,
10 | "bias_correction": false
11 | }
12 | },
13 | "gradient_clipping": 1.0,
14 | "fp16": {
15 | "enabled": true
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/training/BingBertSquad/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import evaluate as eval
4 |
5 | if __name__ == '__main__':
6 | expected_version = '1.1'
7 | parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' +
8 | expected_version)
9 | parser.add_argument('dataset_file', help='Dataset file')
10 | parser.add_argument('prediction_file', help='Prediction File')
11 | args = parser.parse_args()
12 |
13 | print(
14 | json.dumps(
15 | eval.evaluate(expected_version, args.dataset_file,
16 | args.prediction_file)))
17 |
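Usage note (file names assumed, not from the repo): the script imports a local `evaluate` module for the SQuAD metric and takes the dataset and prediction files as positional arguments, e.g. `python evaluate-v1.1.py dev-v1.1.json predictions.json`.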
--------------------------------------------------------------------------------
/training/BingBertSquad/pytorch_pretrained_bert/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.4.0"
2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
3 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
4 | BertForMaskedLM, BertForNextSentencePrediction,
5 | BertForSequenceClassification, BertForMultipleChoice,
6 | BertForTokenClassification, BertForQuestionAnswering)
7 | from .optimization import BertAdam
8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
9 |
--------------------------------------------------------------------------------
/training/DeepSpeed-Domino/requirements.txt:
--------------------------------------------------------------------------------
1 | apex
2 | deepspeed>=0.16.6
3 | nltk
4 | pybind11
5 | transformers
6 | regex
7 |
--------------------------------------------------------------------------------
/training/HelloDeepSpeed/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets==1.13.3
2 | transformers==4.5.1
3 | fire==0.4.0
4 | pytz==2021.1
5 | loguru==0.5.3
6 | sh==1.14.2
7 | pytest==6.2.5
8 | tqdm==4.62.3
--------------------------------------------------------------------------------
/training/HelloDeepSpeed/run.sh:
--------------------------------------------------------------------------------
1 | python train_bert.py --checkpoint_dir ./experiment
2 |
--------------------------------------------------------------------------------
/training/HelloDeepSpeed/run_ds.sh:
--------------------------------------------------------------------------------
1 | deepspeed --bind_cores_to_rank train_bert_ds.py --checkpoint_dir experiment_deepspeed "$@"
2 |
--------------------------------------------------------------------------------
/training/HelloDeepSpeed/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/HelloDeepSpeed/tests/__init__.py
--------------------------------------------------------------------------------
/training/MoQ/README.md:
--------------------------------------------------------------------------------
1 | # Not maintained / deprecated
2 |
3 | > __Warning__
4 | > This folder/feature has been deprecated. Feel free to test and submit an issue if you run into errors.
5 |
6 |
--------------------------------------------------------------------------------
/training/MoQ/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 |
--------------------------------------------------------------------------------
/training/MoQ/run.sh:
--------------------------------------------------------------------------------
1 | OOO=output
2 | MASTER_PORT=12345
3 | GPU=0
4 |
5 | for TSK in qnli #stsb mrpc cola wnli sst2 rte qnli qqp mnli
6 | do
7 |
8 | if [ $TSK == wnli ] || [ $TSK == mrpc ]
9 | then
10 | EPOCH_NUM=5
11 | else
12 | EPOCH_NUM=3
13 | fi
14 |
15 | if [ $TSK == qqp ] || [ $TSK == mnli ]
16 | then
17 | TEST_JSON=test_long.json
18 | else
19 | TEST_JSON=test.json
20 | fi
21 |
22 | PORT=$((MASTER_PORT+GPU))
23 |
24 | rm -rvf ./$OOO/${TSK}
25 |
26 | CUDA_VISIBLE_DEVICES=$GPU python -m torch.distributed.launch \
27 | --master_port $PORT \
28 | --nproc_per_node 1 run_glue.py \
29 | --model_name_or_path bert-base-cased \
30 | --task_name $TSK \
31 | --do_train \
32 | --do_eval \
33 | --max_seq_length 128 \
34 | --per_device_train_batch_size 32 \
35 | --learning_rate 2e-5 \
36 | --num_train_epochs $EPOCH_NUM \
37 | --output_dir ./$OOO/$TSK/ \
38 | --fp16 \
39 | --warmup_steps 2 \
40 | --deepspeed test.json
41 |
42 | done
43 |
--------------------------------------------------------------------------------
/training/MoQ/test.json:
--------------------------------------------------------------------------------
1 | {
2 | "steps_per_print": 10,
3 | "gradient_clipping": 1.0,
4 | "fp16": {
5 | "initial_scale_power": 16,
6 | "enabled": true
7 | },
8 | "quantize_training": {
9 | "enabled": true,
10 | "quantize_verbose": true,
11 | "quantizer_kernel": true,
12 | "quantize_algo": {
13 | "q_type": "symmetric"
14 | },
15 | "quantize_bits": {
16 | "start_bits": 16,
17 | "target_bits": 8
18 | },
19 | "quantize_schedule": {
20 | "quantize_period": 400,
21 | "schedule_offset": 0
22 | },
23 | "quantize_groups": 8
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/training/autotuning/.gitignore:
--------------------------------------------------------------------------------
1 | autotuning_results*
2 | autotuning_exps*
3 | output*
4 | mnli
5 |
--------------------------------------------------------------------------------
/training/autotuning/README.md:
--------------------------------------------------------------------------------
1 | # Autotuning Examples
2 |
3 | These examples showcase the [autotuning](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS).
4 |
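For example (script name hypothetical), a tuning job for a Hugging Face trainer can be launched per the linked autotuning docs as `deepspeed --autotuning tune run_glue.py --deepspeed ds_config_tune.json ...`; the `arg_mappings` entries in the configs below tell the tuner which command-line flags mirror the DeepSpeed config keys it varies.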
--------------------------------------------------------------------------------
/training/autotuning/hf/bert-base/ds_config_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "autotuning": {
4 | "enabled": true,
5 | "overwrite": false,
6 | "max_train_batch_size": 4096,
7 | "arg_mappings": {
8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
10 | }
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/training/autotuning/hf/bert-large/ds_config_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "autotuning": {
4 | "enabled": true,
5 | "overwrite": false,
6 | "arg_mappings": {
7 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
8 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/training/autotuning/hf/deberta/ds_config_fp16_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "fp16": {
4 | "enabled": true,
5 | "initial_scale_power": 12
6 | },
7 | "autotuning": {
8 | "enabled": true,
9 | "overwrite": false,
10 | "fast": true,
11 | "arg_mappings": {
12 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
13 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/training/autotuning/hf/distilbert/ds_config_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "autotuning": {
4 | "enabled": true,
5 | "overwrite": false,
6 | "max_train_batch_size": 4096,
7 | "arg_mappings": {
8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
10 | }
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_fp16_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "fp16": {
4 | "enabled": true
5 | },
6 | "autotuning": {
7 | "enabled": true,
8 | "overwrite": false,
9 | "fast": true,
10 | "arg_mappings": {
11 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
12 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_fp16_z0.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 0
5 | },
6 | "fp16": {
7 | "enabled": true
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_fp16_z1.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 1
5 | },
6 | "fp16": {
7 | "enabled": true
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_fp16_z2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 2
5 | },
6 | "fp16": {
7 | "enabled": true
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_fp16_z3.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 3
5 | },
6 | "fp16": {
7 | "enabled": true
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_tune.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "autotuning": {
4 | "enabled": true,
5 | "overwrite": false,
6 | "fast": true,
7 | "arg_mappings": {
8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps"
10 | }
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_z0.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 0
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_z1.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_z2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 2
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/training/autotuning/hf/dsconfigs/ds_config_z3.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_optimization": {
4 | "stage": 3
5 | }
6 | }
7 |
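These four configs differ only in the ZeRO stage being swept: stage 0 disables ZeRO, stage 1 partitions optimizer states, stage 2 additionally partitions gradients, and stage 3 additionally partitions parameters — letting the autotuner compare the memory/throughput trade-off of each.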
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 12500,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
28 |
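Note: in 0/1 Adam, `var_freeze_step` is the step after which the optimizer's variance state stops being updated (the "freezing" the algorithm is named for); the exact semantics of it and `local_step_scaler` are described in the zero-one-adam tutorial linked from the launch scripts below.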
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 2.82e-5,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 155000,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_ethernet/ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script requires pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs).
5 | # Read the tutorial for more details:
6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/
7 |
8 | base_dir=`pwd`
9 |
10 | JOB_NAME=01adam_bsz4k_seq128_mpi_ethernet
11 | OUTPUT_DIR=${base_dir}/bert_model_outputs
12 |
13 | mkdir -p $OUTPUT_DIR
14 |
15 | # NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi \
17 | ${base_dir}/../../deepspeed_train.py \
18 | --cf ${base_dir}/../../bert_large.json \
19 | --max_seq_length 128 \
20 | --output_dir $OUTPUT_DIR \
21 | --deepspeed \
22 | --print_steps 40 \
23 | --lr_schedule "LE" \
24 | --lr_offset 0.0 \
25 | --job_name $JOB_NAME \
26 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json \
27 | --data_path_prefix /data/bert \
28 | &> ${JOB_NAME}.log"
29 |
30 | echo ${run_cmd}
31 | eval ${run_cmd}
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 12500,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 2.82e-5,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 155000,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/mpi_infiniband/ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script requires pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs).
5 | # Read the tutorial for more details:
6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/
7 |
8 | base_dir=`pwd`
9 |
10 | JOB_NAME=01adam_bsz4k_seq128_mpi_infiniband
11 | OUTPUT_DIR=${base_dir}/bert_model_outputs
12 |
13 | mkdir -p $OUTPUT_DIR
14 |
15 | # This InfiniBand variant keeps IB enabled, so NCCL_IB_DISABLE is deliberately not set here.
16 | run_cmd="NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \
17 | --cf ${base_dir}/../../bert_large.json \
18 | --max_seq_length 128 \
19 | --output_dir $OUTPUT_DIR \
20 | --deepspeed \
21 | --print_steps 40 \
22 | --lr_schedule "LE" \
23 | --lr_offset 0.0 \
24 | --job_name $JOB_NAME \
25 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json \
26 | --data_path_prefix /data/bert \
27 | &> ${JOB_NAME}.log"
28 |
29 | echo ${run_cmd}
30 | eval ${run_cmd}
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq128_nccl.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 12500,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq512_nccl.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "ZeroOneAdam",
8 | "params": {
9 | "lr": 2.82e-5,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "var_freeze_step": 155000,
13 | "local_step_scaler": 32678,
14 | "cuda_aware": false,
15 | "comm_backend_name": "nccl"
16 | }
17 | },
18 | "gradient_clipping": 1.0,
19 |
20 | "wall_clock_breakdown": false,
21 |
22 | "fp16": {
23 | "enabled": true,
24 | "loss_scale": 0,
25 | "initial_scale_power": 16
26 | }
27 | }
--------------------------------------------------------------------------------
/training/bing_bert/01_adam/nccl/ds_train_bert_01adam_bsz4k_seq128_nccl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script requires pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs).
5 | # Read the tutorial for more details:
6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/
7 |
8 | base_dir=`pwd`
9 |
10 | JOB_NAME=01adam_bsz4k_seq128_nccl
11 | OUTPUT_DIR=${base_dir}/bert_model_outputs
12 |
13 | mkdir -p $OUTPUT_DIR
14 |
15 | # Set NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 below to disable InfiniBand if needed.
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_DEBUG=INFO \
17 | deepspeed \
18 | ${base_dir}/../../deepspeed_train.py \
19 | --cf ${base_dir}/../../bert_large.json \
20 | --max_seq_length 128 \
21 | --output_dir $OUTPUT_DIR \
22 | --deepspeed \
23 | --print_steps 40 \
24 | --lr_schedule "LE" \
25 | --lr_offset 0.0 \
26 | --job_name $JOB_NAME \
27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_nccl.json \
28 | --data_path_prefix /data/bert \
29 | &> ${JOB_NAME}.log"
30 |
31 | echo ${run_cmd}
32 | eval ${run_cmd}
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_adam/mpi_ethernet/deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "freeze_step": 23000,
13 | "cuda_aware": false,
14 | "comm_backend_name": "mpi"
15 | }
16 | },
17 | "gradient_clipping": 1.0,
18 |
19 | "wall_clock_breakdown": false,
20 |
21 | "fp16": {
22 | "enabled": true,
23 | "loss_scale": 0,
24 | "initial_scale_power": 16
25 | }
26 | }
27 |
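Note: for 1-bit Adam, `freeze_step` (23000 here) is the number of uncompressed warmup steps run with vanilla Adam before switching to 1-bit compressed communication; the mpi_ethernet, mpi_infiniband, and nccl variants below differ mainly in `comm_backend_name` and in `cuda_aware`, which should be true only for CUDA-aware MPI over InfiniBand.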
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_adam/mpi_infiniband/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "freeze_step": 23000,
13 | "cuda_aware": true,
14 | "comm_backend_name": "mpi"
15 | }
16 | },
17 | "gradient_clipping": 1.0,
18 |
19 | "wall_clock_breakdown": false,
20 |
21 | "fp16": {
22 | "enabled": true,
23 | "loss_scale": 0,
24 | "initial_scale_power": 16
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_adam/mpi_infiniband/ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # If you are able to install pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs),
5 | # we highly recommend using the NCCL-based 1-bit Adam,
6 | # which has better performance and is easier to use
7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_adam/nccl
8 | # and read the tutorial for more details:
9 | # https://www.deepspeed.ai/tutorials/onebit-adam/)
10 |
11 | base_dir=`pwd`
12 |
13 | # Where should we save checkpoints and tensorboard events?
14 | JOB_NAME=onebit_adam_4k_seq128_mpi_infiniband
15 | OUTPUT_DIR=${base_dir}/bert_model_outputs
16 |
17 | mkdir -p $OUTPUT_DIR
18 |
19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \
20 | --cf ${base_dir}/../../bert_large.json \
21 | --max_seq_length 128 \
22 | --output_dir $OUTPUT_DIR \
23 | --deepspeed_mpi \
24 | --deepspeed \
25 | --deepspeed_transformer_kernel \
26 | --print_steps 40 \
27 | --lr_schedule "LE" \
28 | --lr_offset 0.0 \
29 | --job_name $JOB_NAME \
30 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json \
31 | --data_path_prefix /data/bert \
32 | &> ${JOB_NAME}.log
33 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_adam/nccl/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 100,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitAdam",
8 | "params": {
9 | "lr": 4e-4,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "freeze_step": 23000,
13 | "cuda_aware": false,
14 | "comm_backend_name": "nccl"
15 | }
16 | },
17 | "gradient_clipping": 1.0,
18 |
19 | "wall_clock_breakdown": false,
20 |
21 | "fp16": {
22 | "enabled": true,
23 | "loss_scale": 0,
24 | "initial_scale_power": 16
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_adam/nccl/ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script requires pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs).
5 | # Read the tutorial for more details:
6 | # https://www.deepspeed.ai/tutorials/onebit-adam/
7 |
8 | base_dir=`pwd`
9 |
10 | # Where should we save checkpoints and tensorboard events?
11 | JOB_NAME=onebit_adam_4k_seq128_nccl
12 | OUTPUT_DIR=${base_dir}/bert_model_outputs
13 |
14 | mkdir -p $OUTPUT_DIR
15 |
16 | # NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \
18 | --cf ${base_dir}/../../bert_large.json \
19 | --max_seq_length 128 \
20 | --output_dir $OUTPUT_DIR \
21 | --deepspeed \
22 | --deepspeed_transformer_kernel \
23 | --print_steps 40 \
24 | --lr_schedule "LE" \
25 | --lr_offset 0.0 \
26 | --job_name $JOB_NAME \
27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json \
28 | --data_path_prefix /data/bert \
29 | &> ${JOB_NAME}.log
30 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32768,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 2e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 6100,
15 | "cuda_aware": false,
16 | "comm_backend_name": "mpi",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 65536,
3 | "train_micro_batch_size_per_gpu": 64,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 11e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 1000,
15 | "cuda_aware": false,
16 | "comm_backend_name": "mpi",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0,
30 | "initial_scale_power": 16
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32768,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 2e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 6100,
15 | "cuda_aware": true,
16 | "comm_backend_name": "mpi",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 65536,
3 | "train_micro_batch_size_per_gpu": 64,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 11e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 1000,
15 | "cuda_aware": true,
16 | "comm_backend_name": "mpi",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0,
30 | "initial_scale_power": 16
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/mpi_infiniband/ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # If you are able to install pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs),
5 | # we highly recommend using the NCCL-based 1-bit Lamb,
6 | # which has better performance and is easier to use
7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_lamb/nccl
8 | # and read the tutorial for more details:
9 | # https://www.deepspeed.ai/tutorials/onebit-lamb/)
10 |
11 | base_dir=`pwd`
12 |
13 | # Where should we save checkpoints and tensorboard events?
14 | JOB_NAME=onebit_lamb_64k_seq128_mpi_infiniband
15 | OUTPUT_DIR=${base_dir}/bert_model_outputs
16 |
17 | mkdir -p $OUTPUT_DIR
18 |
19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \
20 | --cf ${base_dir}/../../bert_large_lamb.json \
21 | --max_seq_length 128 \
22 | --output_dir $OUTPUT_DIR \
23 | --deepspeed_mpi \
24 | --deepspeed \
25 | --deepspeed_transformer_kernel \
26 | --print_steps 40 \
27 | --lr_schedule "EE" \
28 | --lr_offset 10e-4 \
29 | --job_name $JOB_NAME \
30 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json \
31 | --data_path_prefix /data/bert \
32 | &> ${JOB_NAME}.log
33 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32768,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 2e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 6100,
15 | "cuda_aware": false,
16 | "comm_backend_name": "nccl",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 65536,
3 | "train_micro_batch_size_per_gpu": 64,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "OneBitLamb",
8 | "params": {
9 | "lr": 11e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01,
14 | "freeze_step": 1000,
15 | "cuda_aware": false,
16 | "comm_backend_name": "nccl",
17 | "coeff_beta": 0.9,
18 | "factor_max": 4.0,
19 | "factor_min": 0.5,
20 | "factor_threshold": 0.1
21 | }
22 | },
23 | "gradient_clipping": 1.0,
24 |
25 | "wall_clock_breakdown": false,
26 |
27 | "fp16": {
28 | "enabled": true,
29 | "loss_scale": 0,
30 | "initial_scale_power": 16
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/training/bing_bert/1-bit_lamb/nccl/ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script requires pytorch >= 1.8
4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs).
5 | # Read the tutorial for more details:
6 | # https://www.deepspeed.ai/tutorials/onebit-lamb
7 |
8 | base_dir=`pwd`
9 |
10 | # Where should we save checkpoints and tensorboard events?
11 | JOB_NAME=onebit_lamb_64k_seq128_nccl
12 | OUTPUT_DIR=${base_dir}/bert_model_outputs
13 |
14 | mkdir -p $OUTPUT_DIR
15 |
16 | # NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \
18 | --cf ${base_dir}/../../bert_large_lamb.json \
19 | --max_seq_length 128 \
20 | --output_dir $OUTPUT_DIR \
21 | --deepspeed \
22 | --deepspeed_transformer_kernel \
23 | --print_steps 40 \
24 | --lr_schedule "EE" \
25 | --lr_offset 10e-4 \
26 | --job_name $JOB_NAME \
27 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json \
28 | --data_path_prefix /data/bert \
29 | --ckpt_to_save 150 \
30 | &> ${JOB_NAME}.log
31 |
--------------------------------------------------------------------------------
/training/bing_bert/bert_dataset_provider.py:
--------------------------------------------------------------------------------
1 | class BertDatasetProviderInterface:
2 | def get_shard(self, index, shuffle=True):
3 | raise NotImplementedError
4 |
5 | def release_shard(self, index):
6 | raise NotImplementedError
7 |
8 | def prefetch_shard(self, index):
9 | raise NotImplementedError
10 |
11 | def get_batch(self, batch_iter):
12 | raise NotImplementedError
13 |
14 | def prefetch_batch(self):
15 | raise NotImplementedError
16 |
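For illustration only — a minimal in-memory implementation of this interface (hypothetical, not part of the repo; real providers stream pre-sharded BERT pretraining data from disk):

```python
import random

# Hypothetical toy provider illustrating the contract defined above.
class ToyBertDatasetProvider(BertDatasetProviderInterface):
    def __init__(self, shards):
        self.shards = shards      # e.g. [[sample, ...], [sample, ...]]
        self._prefetched = {}     # shard index -> materialized copy

    def get_shard(self, index, shuffle=True):
        # Use a prefetched copy when available, otherwise materialize now.
        data = self._prefetched.pop(index, None)
        if data is None:
            data = list(self.shards[index])
        if shuffle:
            random.shuffle(data)
        return data, len(data)

    def release_shard(self, index):
        self._prefetched.pop(index, None)  # free any cached copy

    def prefetch_shard(self, index):
        self._prefetched[index] = list(self.shards[index])

    def get_batch(self, batch_iter):
        return next(batch_iter)

    def prefetch_batch(self):
        pass  # nothing to overlap for in-memory data
```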
--------------------------------------------------------------------------------
/training/bing_bert/data_worker.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import queue
3 | import time
4 |
5 |
6 | class AsyncWorker(threading.Thread):
7 | def __init__(self, dataloaders, dataset_picker):
8 | threading.Thread.__init__(self)
9 | self.req_queue = queue.Queue()
10 | self.ret_queue = queue.Queue()
11 | self.dataloaders = dataloaders
12 | self.dataset_picker = dataset_picker
13 | self.prefetch_idx = 3
14 | for i in range(self.prefetch_idx):
15 | self.req_queue.put(dataset_picker[i])
16 |
17 | def run(self):
18 | while True:
19 | dataset_type = self.req_queue.get(block=True)
20 | if dataset_type is None:
21 | break
22 | batch = next(self.dataloaders[dataset_type])
23 | self.req_queue.task_done()
24 | self.ret_queue.put(batch)
25 |
26 | def get(self):
27 | batch = self.ret_queue.get()
28 | self.ret_queue.task_done()
29 | return batch
30 |
31 | def prefetch(self):
32 | if self.prefetch_idx < len(self.dataset_picker):
33 | self.req_queue.put(self.dataset_picker[self.prefetch_idx])
34 | self.prefetch_idx += 1
35 |
36 | def stop(self):
37 | self.req_queue.put(None)
38 |
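A self-contained sketch of how `AsyncWorker` is meant to be driven — the dataloaders and schedule below are toy stand-ins, not the repo's (note the constructor pre-queues three requests, so the schedule needs at least three entries):

```python
import itertools

# Toy stand-ins: one (infinite) iterator of batches per dataset type.
dataloaders = {
    "seq128": itertools.cycle([{"seq_len": 128}]),
    "seq512": itertools.cycle([{"seq_len": 512}]),
}
# Which dataset each training step should draw from.
dataset_picker = ["seq128", "seq128", "seq512", "seq128", "seq512"]

worker = AsyncWorker(dataloaders, dataset_picker)
worker.start()  # the thread begins serving the three pre-queued requests

for _ in dataset_picker:
    batch = worker.get()   # blocks until the next prefetched batch is ready
    worker.prefetch()      # queue one more request while this batch trains
    # ... forward/backward on `batch` ...

worker.stop()  # enqueue the None sentinel so run() exits
worker.join()
```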
--------------------------------------------------------------------------------
/training/bing_bert/deepspeed_bsz32k_lamb_config_seq512.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32768,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "Lamb",
8 | "params": {
9 | "lr": 2e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01
14 | }
15 | },
16 | "gradient_clipping": 1.0,
17 |
18 | "wall_clock_breakdown": false,
19 |
20 | "fp16": {
21 | "enabled": true,
22 | "loss_scale": 0
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/training/bing_bert/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 4096,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": true,
6 | "gradient_predivide_factor": 8,
7 | "optimizer": {
8 | "type": "Adam",
9 | "params": {
10 | "lr": 1e-3,
11 | "weight_decay": 0.01,
12 | "bias_correction": false
13 | }
14 | },
15 | "gradient_clipping": 1.0,
16 | "wall_clock_breakdown": false,
17 | "fp16": {
18 | "enabled": true,
19 | "loss_scale": 0
20 | },
21 | "progressive_layer_drop": {
22 | "enabled": true,
23 | "theta": 0.5,
24 | "gamma": 0.001
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/training/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 65536,
3 | "train_micro_batch_size_per_gpu": 64,
4 | "steps_per_print": 1000,
5 | "prescale_gradients": false,
6 | "optimizer": {
7 | "type": "Lamb",
8 | "params": {
9 | "lr": 11e-3,
10 | "weight_decay": 0.01,
11 | "bias_correction": false,
12 | "max_coeff": 0.3,
13 | "min_coeff": 0.01
14 | }
15 | },
16 | "gradient_clipping": 1.0,
17 |
18 | "wall_clock_breakdown": false,
19 |
20 | "fp16": {
21 | "enabled": true,
22 | "loss_scale": 0
23 | },
24 | "sparse_attention": {
25 | "mode": "fixed",
26 | "block": 16,
27 | "different_layout_per_head": true,
28 | "num_local_blocks": 4,
29 | "num_global_blocks": 1,
30 | "attention": "bidirectional",
31 | "horizontal_global_attention": false,
32 | "num_different_global_patterns": 4
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script runs deepspeed using sparse attention for BertEncoderLayer.
4 |
5 | base_dir=`pwd`
6 |
7 | # Where should we save checkpoints and tensorboard events?
8 | JOB_NAME=lamb_64k_seq128
9 | OUTPUT_DIR=${base_dir}/bert_model_outputs
10 |
11 | mkdir -p $OUTPUT_DIR
12 |
13 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \
14 | --cf ${base_dir}/bert_large_lamb.json \
15 | --max_seq_length 128 \
16 | --output_dir $OUTPUT_DIR \
17 | --deepspeed \
18 | --deepspeed_sparse_attention \
19 | --print_steps 100 \
20 | --lr_schedule "EE" \
21 | --lr_offset 10e-4 \
22 | --job_name $JOB_NAME \
23 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \
24 | --data_path_prefix /data/bert \
25 | &> ${JOB_NAME}.log
26 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_train_bert_bsz32k_seq512.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | base_dir=`pwd`
4 |
5 | # Where should we save checkpoints and tensorboard events?
6 | JOB_NAME=lamb_32k_chkpt150_seq512
7 | OUTPUT_DIR=${base_dir}/bert_model_outputs
8 |
9 | # Assumes the job name from the previous seq128 run; training resumes from epoch 150
10 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_64k_seq128
11 | CHECKPOINT_EPOCH150_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch150_*`
12 | echo "checkpoint id: $CHECKPOINT_EPOCH150_NAME"
13 |
14 | mkdir -p $OUTPUT_DIR
15 |
16 | deepspeed ${base_dir}/deepspeed_train.py \
17 | --cf ${base_dir}/bert_large_lamb.json \
18 | --max_seq_length 512 \
19 | --output_dir $OUTPUT_DIR \
20 | --print_steps 100 \
21 | --deepspeed \
22 | --deepspeed_transformer_kernel \
23 | --job_name $JOB_NAME \
24 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \
25 | --data_path_prefix /data/bert \
26 | --validation_data_path_prefix /data/bert \
27 | --rewarmup \
28 | --lr_schedule "EE" \
29 | --attention_dropout_checkpoint \
30 | --lr_offset 0.0 \
31 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \
32 | --load_checkpoint_id ${CHECKPOINT_EPOCH150_NAME} \
33 | &> ${JOB_NAME}.log
34 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_train_bert_bsz64k_seq128.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | base_dir=`pwd`
4 |
5 | # Where should we save checkpoints and tensorboard events?
6 | JOB_NAME=lamb_64k_seq128
7 | OUTPUT_DIR=${base_dir}/bert_model_outputs
8 |
9 | mkdir -p $OUTPUT_DIR
10 |
11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \
12 | --cf ${base_dir}/bert_large_lamb.json \
13 | --max_seq_length 128 \
14 | --output_dir $OUTPUT_DIR \
15 | --deepspeed \
16 | --deepspeed_transformer_kernel \
17 | --print_steps 100 \
18 | --lr_schedule "EE" \
19 | --lr_offset 10e-4 \
20 | --job_name $JOB_NAME \
21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \
22 | --data_path_prefix /data/bert \
23 | &> ${JOB_NAME}.log
24 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_train_bert_nvidia_data_bsz32k_seq512.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [[ -z $1 ]]; then
4 | LOAD_EPOCH=16
5 | else
6 | LOAD_EPOCH=$1
7 | fi
8 | base_dir=`pwd`
9 |
10 | # Where should we save checkpoints and tensorboard events?
11 | JOB_NAME=lamb_nvidia_data_32k_chkpt${LOAD_EPOCH}_seq512
12 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs
13 |
14 | # Assumes the job name from the previous seq128 run; training resumes from epoch ${LOAD_EPOCH} (default 16)
15 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_nvidia_data_64k_seq128
16 | CHECKPOINT_EPOCH_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch${LOAD_EPOCH}_*`
17 | echo "checkpoint id: $CHECKPOINT_EPOCH_NAME"
18 |
19 | mkdir -p $OUTPUT_DIR
20 |
21 | deepspeed ${base_dir}/deepspeed_train.py \
22 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \
23 | --max_seq_length 512 \
24 | --output_dir $OUTPUT_DIR \
25 | --print_steps 1 \
26 | --deepspeed \
27 | --deepspeed_transformer_kernel \
28 | --job_name $JOB_NAME \
29 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \
30 | --data_path_prefix /workspace/bert \
31 | --use_nvidia_dataset \
32 | --rewarmup \
33 | --lr_schedule "EE" \
34 | --attention_dropout_checkpoint \
35 | --lr_offset 0.0 \
36 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \
37 | --load_checkpoint_id ${CHECKPOINT_EPOCH_NAME} \
38 | &> ${JOB_NAME}.log
39 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_train_bert_nvidia_data_bsz64k_seq128.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | base_dir=`pwd`
4 |
5 | # Where should we save checkpoints and tensorboard events?
6 | JOB_NAME=lamb_nvidia_data_64k_seq128
7 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs
8 |
9 | mkdir -p $OUTPUT_DIR
10 |
11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \
12 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \
13 | --max_seq_length 128 \
14 | --output_dir $OUTPUT_DIR \
15 | --deepspeed \
16 | --deepspeed_transformer_kernel \
17 | --print_steps 100 \
18 | --lr_schedule "EE" \
19 | --lr_offset 10e-4 \
20 | --job_name $JOB_NAME \
21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \
22 | --data_path_prefix /workspace/bert \
23 | --use_nvidia_dataset \
24 | &> ${JOB_NAME}.log
25 |
--------------------------------------------------------------------------------
/training/bing_bert/ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | base_dir=`pwd`
4 |
5 | # Where should we save checkpoints and tensorboard events?
6 | JOB_NAME=adam_4k_seq128_progressive_layer_drop
7 | OUTPUT_DIR=${base_dir}/bert_model_outputs
8 |
9 | mkdir -p $OUTPUT_DIR
10 |
11 | config="--progressive_layer_drop"
12 |
13 | NCCL_TREE_THRESHOLD=0 deepspeed \
14 | ${base_dir}/deepspeed_train.py \
15 | --cf ${base_dir}/bert_base_large_lr.json \
16 | --max_seq_length 128 \
17 | --output_dir $OUTPUT_DIR \
18 | --deepspeed \
19 | --print_steps 100 \
20 | --lr_schedule "LE" \
21 | --job_name $JOB_NAME \
22 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json \
23 | --data_path_prefix /data/bert \
24 | ${config} \
25 | &> ${JOB_NAME}.log
26 |
--------------------------------------------------------------------------------
/training/bing_bert/glue_bert_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32,
3 | "train_micro_batch_size_per_gpu": 32,
4 | "steps_per_print": 10,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 3e-5,
9 | "weight_decay": 0.0,
10 | "bias_correction": false
11 | }
12 | },
13 | "gradient_clipping": 1.0,
14 | "fp16": {
15 | "enabled": true
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/training/bing_bert/glue_bert_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 32,
3 | "train_micro_batch_size_per_gpu": 4,
4 | "steps_per_print": 10,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 3e-5,
9 | "weight_decay": 0.0,
10 | "bias_correction": false
11 | }
12 | },
13 | "gradient_clipping": 1.0,
14 | "fp16": {
15 | "enabled": true
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/training/bing_bert/pytorch_pretrained_bert/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.4.0"
2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
3 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
4 | BertForMaskedLM, BertForNextSentencePrediction,
5 | BertForSequenceClassification, BertForMultipleChoice,
6 | BertForTokenClassification, BertForQuestionAnswering)
7 | from .optimization import BertAdam
8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
9 |
--------------------------------------------------------------------------------
/training/bing_bert/pytorch_pretrained_bert/__main__.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | def main():
3 | import sys
4 | try:
5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
6 | except ModuleNotFoundError:
7 | print(
8 | "pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
9 | "In that case, it requires TensorFlow to be installed. Please see "
10 | "https://www.tensorflow.org/install/ for installation instructions."
11 | )
12 | raise
13 |
14 | if len(sys.argv) != 5:
15 | # pylint: disable=line-too-long
16 | print(
17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`"
18 | )
19 | else:
20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop()
21 | TF_CONFIG = sys.argv.pop()
22 | TF_CHECKPOINT = sys.argv.pop()
23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG,
24 | PYTORCH_DUMP_OUTPUT)
25 |
26 |
27 | if __name__ == '__main__':
28 | main()
29 |
--------------------------------------------------------------------------------
/training/bing_bert/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn
2 |
--------------------------------------------------------------------------------
/training/bing_bert/turing/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch.distributed as dist
3 |
4 | logging.basicConfig(
5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
6 | datefmt='%m/%d/%Y %H:%M:%S',
7 | level=logging.INFO)
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | class Logger():
12 | def __init__(self, cuda=False):
13 | self.logger = logging.getLogger(__name__)
14 | self.cuda = cuda
15 |
16 | def info(self, message, *args, **kwargs):
17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda:
18 | self.logger.info(message, *args, **kwargs)
19 |
20 | def error(self, message, *args, **kwargs):
21 | self.logger.error(message, *args, **kwargs)
22 |
--------------------------------------------------------------------------------
/training/bing_bert/turing/text.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | PAD = 0
4 |
5 |
6 | def mask(x):
7 | return x != PAD
8 |
9 |
10 | def torch_long(x):
11 | return torch.LongTensor(x)
12 |
--------------------------------------------------------------------------------
/training/cifar/README.md:
--------------------------------------------------------------------------------
1 | Thanks to Gopi Kumar for contributing this example, which demonstrates how to apply DeepSpeed to a CIFAR-10 model.
2 |
3 | `cifar10_tutorial.py`
4 | Baseline CIFAR-10 model.
5 |
6 | `cifar10_deepspeed.py`
7 | CIFAR-10 model with DeepSpeed applied.
8 |
9 | `run_ds.sh`
10 | Script for running the DeepSpeed-applied model.
11 |
12 | `run_ds_moe.sh`
13 | Script for running the DeepSpeed model with Mixture of Experts (MoE) integration.
14 |
15 | `run_ds_prmoe.sh`
16 | Script for running the DeepSpeed model with Pyramid Residual MoE (PR-MoE) integration.
17 |
18 | * To run the baseline CIFAR-10 model - `python cifar10_tutorial.py`
19 | * To run the DeepSpeed CIFAR-10 model - `bash run_ds.sh`
20 | * To run the DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - `bash run_ds_moe.sh`
21 | * To run the DeepSpeed CIFAR-10 model with Pyramid Residual MoE (PR-MoE) - `bash run_ds_prmoe.sh`
22 | * To run with a different data type (default=`fp16`) and ZeRO stage (default=`0`) - `bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}`
23 |
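For example, `bash run_ds.sh --dtype=bf16 --stage=2` trains in bf16 with ZeRO stage 2; the script simply forwards any extra arguments to `cifar10_deepspeed.py`.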
--------------------------------------------------------------------------------
/training/cifar/requirements.txt:
--------------------------------------------------------------------------------
1 | torchvision==0.4.0
2 | pillow>=7.1.0
3 | matplotlib
4 |
--------------------------------------------------------------------------------
/training/cifar/run_ds.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed --bind_cores_to_rank cifar10_deepspeed.py --deepspeed "$@"
4 |
--------------------------------------------------------------------------------
/training/cifar/run_ds_moe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Number of nodes
4 | NUM_NODES=1
5 | # Number of GPUs per node
6 | NUM_GPUS=2
7 | # Size of the expert-parallel world (must divide and not exceed the total world size)
8 | EP_SIZE=2
9 | # Number of total experts
10 | EXPERTS=2
11 |
12 | deepspeed --num_nodes=${NUM_NODES} \
13 | --num_gpus=${NUM_GPUS} \
14 | --bind_cores_to_rank \
15 | cifar10_deepspeed.py \
16 | --log-interval 100 \
17 | --deepspeed \
18 | --moe \
19 | --ep-world-size ${EP_SIZE} \
20 | --num-experts ${EXPERTS} \
21 | --top-k 1 \
22 | --noisy-gate-policy 'RSample' \
23 | --moe-param-group
24 |
--------------------------------------------------------------------------------
/training/cifar/run_ds_prmoe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Number of nodes
4 | NUM_NODES=1
5 | # Number of GPUs per node
6 | NUM_GPUS=2
7 | # Size of the expert-parallel world (must divide and not exceed the total world size)
8 | EP_SIZE=2
9 | # Numbers of experts; PR-MoE requires at least two values (they may differ)
10 | EXPERTS='2 4'
11 |
12 | deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \
13 | --log-interval 100 \
14 | --deepspeed \
15 | --moe \
16 | --ep-world-size ${EP_SIZE} \
17 | --num-experts ${EXPERTS} \
18 | --top-k 1 \
19 | --mlp-type 'residual' \
20 | --noisy-gate-policy 'RSample' \
21 | --moe-param-group
22 |
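Both scripts ultimately build DeepSpeed MoE layers inside the CIFAR network: `--ep-world-size` maps to `ep_size`, `--num-experts` to `num_experts`, `--top-k` to `k`, and `--mlp-type 'residual'` corresponds to `use_residual=True`. A minimal sketch of that mapping, with an illustrative expert module (must run under the `deepspeed` launcher so the expert-parallel groups can be created):

```python
import deepspeed
import torch
import torch.nn as nn
from deepspeed.moe.layer import MoE

deepspeed.init_distributed()  # expert parallelism needs process groups

hidden = 128  # illustrative hidden size
expert = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.ReLU(),
                       nn.Linear(4 * hidden, hidden))

# run_ds_moe.sh: 2 experts, top-1 gating, expert-parallel world of 2.
moe = MoE(hidden_size=hidden, expert=expert, num_experts=2, ep_size=2,
          k=1, noisy_gate_policy='RSample')

# run_ds_prmoe.sh: the 'residual' mlp-type corresponds to use_residual=True.
pr_moe = MoE(hidden_size=hidden, expert=expert, num_experts=4, ep_size=2,
             k=1, noisy_gate_policy='RSample', use_residual=True)

x = torch.randn(8, 16, hidden)
out, aux_loss, exp_counts = moe(x)  # output, load-balancing loss, expert counts
```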
--------------------------------------------------------------------------------
/training/data_efficiency/gpt_finetuning/bash_script/run_medium_random_ltd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ### Apply random-LTD to fine-tune GPT-2 Medium (24 layers) on PTB.
3 | ### See the random-LTD paper: https://arxiv.org/abs/2211.11586
4 | export CUDA_VISIBLE_DEVICES=2
5 | mkdir -p ./output/check_medium
6 | python -m torch.distributed.launch --nproc_per_node=1 \
7 | --master_port 12345 \
8 | run_clm_no_trainer.py \
9 | --random_ltd \
10 | --dataset_name ptb_text_only \
11 | --dataset_config_name penn_treebank \
12 | --model_name_or_path gpt2-medium \
13 | --per_device_train_batch_size 2 \
14 | --per_device_eval_batch_size 2 \
15 | --num_train_epochs 2 \
16 | --deepspeed_config config/ds_config_gpt_medium_random_ltd.json \
17 | --deepspeed --seed 1234 --num_warmup_steps 100 \
18 | --output_dir ./output/check_medium &> ./output/check_medium/training.log
--------------------------------------------------------------------------------
/training/data_efficiency/gpt_finetuning/config/ds_config_gpt_base_random_ltd.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 4,
3 | "train_micro_batch_size_per_gpu": 2,
4 | "steps_per_print": 2,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 0.0001,
9 | "betas": [0.8,0.999],
10 | "eps": 1e-8,
11 | "weight_decay": 3e-7
12 | }
13 | },
14 | "zero_optimization": {
15 | "stage": 0
16 | },
17 | "fp16":{
18 | "enabled": false
19 | },
20 | "gradient_clipping": 1.0,
21 | "prescale_gradients": true,
22 | "wall_clock_breakdown" : false,
23 | "data_efficiency": {
24 | "enabled": true,
25 | "data_routing": {
26 | "enabled": true,
27 | "random_ltd":{
28 | "enabled": true,
29 | "total_layer_num": 12,
30 | "random_ltd_layer_num": 10,
31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10],
32 | "model_mask_name": "attention_mask",
33 | "model_type": "decoder",
34 | "hidden_state_order": "batch_seq_dim",
35 | "random_ltd_schedule": {
36 | "min_value": 128,
37 | "max_value": 1024,
38 | "schedule_type": "fixed_linear",
39 | "schedule_config": {
40 | "require_steps": 400,
41 | "seq_per_step": 8
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
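The `random_ltd_schedule` above grows the number of kept tokens from 128 to 1024 over 400 steps in increments of 8. As a rough illustration of the `fixed_linear` arithmetic (this is a sketch of the schedule's intent, not DeepSpeed's exact implementation):

```python
def fixed_linear_seq_len(step, min_value=128, max_value=1024,
                         require_steps=400, seq_per_step=8):
    """Linearly grow the kept sequence length, quantized to seq_per_step."""
    frac = min(step, require_steps) / require_steps
    value = min_value + (max_value - min_value) * frac
    return int(value // seq_per_step) * seq_per_step

assert fixed_linear_seq_len(0) == 128
assert fixed_linear_seq_len(200) == 576    # halfway between 128 and 1024
assert fixed_linear_seq_len(1000) == 1024  # clamps at max_value
```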
--------------------------------------------------------------------------------
/training/data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_reduce.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Set these two to the same values you used for the map job; they tell the
4 | # reduce job how many map result files to expect.
5 | num_workers=1
6 | num_threads=1
7 | # The reduce job has only one worker, but it can be accelerated by multithreading.
8 | num_threads_reduce=1
9 |
10 | save_path="/blob/users/conglli/data/analysis_ptb_gpt/"
11 |
12 | metric='total_vocab_freq'
13 | # metric='vocab_rarity' # this requires the result of total_vocab_freq
14 |
15 | dataset_name="ptb_text_only"
16 | dataset_config_name="penn_treebank"
17 | model_name_or_path="gpt2-medium"
18 |
19 | batch_size=1000
20 |
21 | jobname="gpt-ptb-analyzing-${metric}-reduce"
22 |
23 | options=" \
24 | --analyzing_task reduce \
25 | --analyzing_metric ${metric} \
26 | --analyzing_num_workers ${num_workers} \
27 | --analyzing_num_threads ${num_threads} \
28 | --analyzing_num_threads_reduce ${num_threads_reduce} \
29 | --dataset_name ${dataset_name} \
30 | --dataset_config_name ${dataset_config_name} \
31 | --model_name_or_path ${model_name_or_path} \
32 | --per_device_train_batch_size ${batch_size} \
33 | --output_dir ${save_path}"
34 |
35 | python ../analyze_data.py ${options} &> ${jobname}.log
--------------------------------------------------------------------------------
/training/data_efficiency/gpt_finetuning/finetune/ds_config_gpt2_TEMPLATE.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : GB_SIZE,
3 | "train_micro_batch_size_per_gpu": MB_SIZE,
4 | "steps_per_print": 10,
5 | "zero_optimization": {
6 | "stage": 0
7 | },
8 | "fp16":{
9 | "enabled": false
10 | },
11 | "gradient_clipping": 1.0,
12 | "prescale_gradients": true,
13 | "wall_clock_breakdown" : false,
14 | "data_efficiency": {
15 | "enabled": true,
16 | "data_routing": {
17 | "enabled": LTD_ENABLED,
18 | "random_ltd":{
19 | "enabled": LTD_ENABLED,
20 | "total_layer_num": 12,
21 | "random_ltd_layer_num": 10,
22 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10],
23 | "model_mask_name": "attention_mask",
24 | "model_type": "decoder",
25 | "hidden_state_order": "batch_seq_dim",
26 | "random_ltd_schedule": {
27 | "min_value": LTD_MIN,
28 | "max_value": 1024,
29 | "schedule_type": "fixed_linear",
30 | "schedule_config": {
31 | "require_steps": LTD_STEP,
32 | "seq_per_step": 8
33 | }
34 | }
35 | }
36 | }
37 | }
38 | }
39 |
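`GB_SIZE`, `MB_SIZE`, and the `LTD_*` tokens are placeholders that the launch scripts substitute before training. A Python equivalent of that substitution step, with values chosen purely for illustration:

```python
from pathlib import Path

template = Path('ds_config_gpt2_TEMPLATE.json').read_text()

# Illustrative values; the real scripts derive these from the job settings.
substitutions = {
    'GB_SIZE': '4',         # train_batch_size
    'MB_SIZE': '2',         # train_micro_batch_size_per_gpu
    'LTD_ENABLED': 'true',  # turn random-LTD data routing on
    'LTD_MIN': '128',       # initial kept sequence length
    'LTD_STEP': '400',      # steps to reach max_value
}
for token, value in substitutions.items():
    template = template.replace(token, value)

Path('ds_config_gpt2.json').write_text(template)
```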
--------------------------------------------------------------------------------
/training/data_efficiency/gpt_finetuning/requirement.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.8.0
2 | sentencepiece != 0.1.92
3 | protobuf
4 | transformers == 4.15.0
5 | accelerate
--------------------------------------------------------------------------------
/training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png
--------------------------------------------------------------------------------
/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png
--------------------------------------------------------------------------------
/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/bash_script/run_cifar_random_ltd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export CUDA_VISIBLE_DEVICES=0
4 | mkdir -p out/cifar/
5 | # deepspeed --include worker-0:0 --master_port 60000 main_cifar.py \
6 | # --deepspeed_config config/ds_config.json \
7 | # --deepspeed --random_ltd \
8 | # --dataset cifar10vit224 \
9 | # --seed 1234 \
10 | # --printfreq 400 \
11 | # --arch lvits16r224 \
12 | # --optimizer sgd \
13 | # --lr 0.0001 --seq_len 197 \
14 | # --scheduler constant \
15 | # --epochs 14 \
16 | # --batchsize 32 \
17 | # --data_outdir check/cifar/ | tee -a check/cifar/training.log
18 |
19 | deepspeed --num_nodes 1 --num_gpus 1 --master_port 60000 main_cifar.py \
20 | --deepspeed_config config/ds_config_cifar_random_ltd.json \
21 | --deepspeed --random_ltd \
22 | --dataset cifar10vit224 \
23 | --seed 1234 \
24 | --printfreq 400 \
25 | --arch vits16r224 \
26 | --optimizer sgd \
27 | --lr 0.0001 --seq_len 197 \
28 | --scheduler constant \
29 | --epochs 14 \
30 | --batchsize 128 \
31 | --data_outdir out/cifar/ | tee -a out/cifar/training1.log
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/config/ds_config_cifar_random_ltd.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 32,
3 | "train_micro_batch_size_per_gpu": 32,
4 | "steps_per_print": 200,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 0.0001,
9 | "betas": [0.8,0.999],
10 | "eps": 1e-8,
11 | "weight_decay": 3e-7
12 | }
13 | },
14 | "zero_optimization": {
15 | "stage": 0
16 | },
17 | "fp16":{
18 | "enabled": false
19 | },
20 | "gradient_clipping": 1.0,
21 | "prescale_gradients": true,
22 | "wall_clock_breakdown" : false,
23 | "data_efficiency": {
24 | "enabled": true,
25 | "data_routing": {
26 | "enabled": true,
27 | "random_ltd":{
28 | "enabled": true,
29 | "total_layer_num": 12,
30 | "random_ltd_layer_num": 10,
31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10],
32 | "model_mask_name": null,
33 | "model_type": "decoder",
34 | "hidden_state_order": "batch_seq_dim",
35 | "random_ltd_schedule": {
36 | "min_value": 32,
37 | "max_value": 197,
38 | "schedule_type":"fixed_linear",
39 | "schedule_config": {
40 | "require_steps": 3910,
41 | "seq_per_step": 8
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/config/ds_config_imagenet_random_ltd.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 256,
3 | "train_micro_batch_size_per_gpu": 16,
4 | "steps_per_print": 200,
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 0.0001,
9 | "betas": [0.8,0.999],
10 | "eps": 1e-8,
11 | "weight_decay": 3e-7
12 | }
13 | },
14 | "zero_optimization": {
15 | "stage": 0
16 | },
17 | "fp16":{
18 | "enabled": false
19 | },
20 | "gradient_clipping": 1.0,
21 | "prescale_gradients": true,
22 | "wall_clock_breakdown" : false,
23 | "data_efficiency": {
24 | "enabled": true,
25 | "data_routing": {
26 | "enabled": true,
27 | "random_ltd":{
28 | "enabled": true,
29 | "total_layer_num": 12,
30 | "random_ltd_layer_num": 10,
31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10],
32 | "model_mask_name": null,
33 | "model_type": "decoder",
34 | "hidden_state_order": "batch_seq_dim",
35 | "random_ltd_schedule": {
36 | "min_value": 32,
37 | "max_value": 197,
38 | "schedule_type":"fixed_linear",
39 | "schedule_config": {
40 | "require_steps": 3910,
41 | "seq_per_step": 8
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import absolute_import
16 | from .vit import *
17 | from .vit import Block
18 |
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/requirement.txt:
--------------------------------------------------------------------------------
1 | timm==0.6.5
2 | torch>1.10.0
3 | torchvision>0.11.1
4 | mpi4py
5 |
--------------------------------------------------------------------------------
/training/data_efficiency/vit_finetuning/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .utils import get_model, get_optimizer, get_scheduler, LossTracker, AverageMeter, ProgressMeter, accuracy, run_cmd
16 | from .get_data import get_dataset
17 |
18 | __all__ = ["get_dataset", "ImageMemFolder", "AverageMeter", "ProgressMeter", "accuracy", "get_optimizer", "get_scheduler", "get_model", "LossTracker", "run_cmd"]
19 |
--------------------------------------------------------------------------------
/training/gan/gan_baseline_run.sh:
--------------------------------------------------------------------------------
1 | python gan_baseline_train.py --dataset celeba --cuda --tensorboard_path './runs/baseline'
2 |
--------------------------------------------------------------------------------
/training/gan/gan_deepspeed_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 64,
3 | "optimizer": {
4 | "type": "Adam",
5 | "params": {
6 | "lr": 0.0002,
7 | "betas": [
8 | 0.5,
9 | 0.999
10 | ],
11 | "eps": 1e-8
12 | }
13 | },
14 | "steps_per_print" : 10
15 | }
16 |
--------------------------------------------------------------------------------
/training/gan/gan_deepspeed_run.sh:
--------------------------------------------------------------------------------
1 | deepspeed gan_deepspeed_train.py --dataset celeba --cuda --deepspeed_config gan_deepspeed_config.json --tensorboard_path './runs/deepspeed'
2 |
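Inside `gan_deepspeed_train.py` the config above is consumed by `deepspeed.initialize`. A minimal sketch of the pattern, assuming a DCGAN-style setup in which the generator and discriminator each get their own engine (the modules below are illustrative stand-ins):

```python
import deepspeed
import torch.nn as nn

# Illustrative stand-ins for the real generator/discriminator networks.
netG = nn.Sequential(nn.ConvTranspose2d(100, 3, 4), nn.Tanh())
netD = nn.Sequential(nn.Conv2d(3, 64, 4), nn.Sigmoid())

# Each submodel gets its own engine; both share the JSON config, so both
# pick up the Adam settings and train_batch_size defined there.
engineG, optG, _, _ = deepspeed.initialize(
    model=netG, model_parameters=netG.parameters(),
    config='gan_deepspeed_config.json')
engineD, optD, _, _ = deepspeed.initialize(
    model=netD, model_parameters=netD.parameters(),
    config='gan_deepspeed_config.json')

# Typical GAN step: engineD.backward(d_loss); engineD.step();
# then engineG.backward(g_loss); engineG.step().
```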
--------------------------------------------------------------------------------
/training/imagenet/assets/resnetplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/imagenet/assets/resnetplot.png
--------------------------------------------------------------------------------
/training/imagenet/config/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 256,
3 | "gradient_accumulation_steps": 1,
4 | "steps_per_print": 50,
5 |
6 | "optimizer": {
7 | "type": "Adam",
8 | "params": {
9 | "lr": 0.001,
10 | "betas": [
11 | 0.8,
12 | 0.999
13 | ],
14 | "eps": 1e-8,
15 | "weight_decay": 3e-7
16 | }
17 | },
18 |
19 | "zero_optimization": {
20 | "stage": 0
21 | },
22 | "zero_allow_untested_optimizer": true,
23 | "fp16": {
24 | "enabled": false
25 | },
26 | "gradient_clipping": 0,
27 | "prescale_gradients": false,
28 | "cuda_visible_devices": 0,
29 | "wall_clock_breakdown" : false
30 | }
31 |
--------------------------------------------------------------------------------
/training/imagenet/config/ds_fp16_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 256,
3 | "gradient_accumulation_steps": 1,
4 | "steps_per_print": 50,
5 |
6 | "optimizer": {
7 | "type": "Adam",
8 | "params": {
9 | "lr": 0.001,
10 | "betas": [
11 | 0.8,
12 | 0.999
13 | ],
14 | "eps": 1e-8,
15 | "weight_decay": 3e-7
16 | }
17 | },
18 |
19 | "zero_optimization": {
20 | "stage": 0
21 | },
22 | "zero_allow_untested_optimizer": true,
23 | "fp16": {
24 | "enabled": true,
25 | "auto_cast": true
26 | },
27 | "gradient_clipping": 0,
28 | "prescale_gradients": false,
29 | "cuda_visible_devices": 0,
30 | "wall_clock_breakdown" : false
31 | }
32 |
--------------------------------------------------------------------------------
/training/imagenet/config/ds_fp16_z1_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": 256,
3 | "gradient_accumulation_steps": 1,
4 | "steps_per_print": 50,
5 |
6 | "optimizer": {
7 | "type": "Adam",
8 | "params": {
9 | "lr": 0.001,
10 | "betas": [
11 | 0.8,
12 | 0.999
13 | ],
14 | "eps": 1e-8,
15 | "weight_decay": 3e-7
16 | }
17 | },
18 |
19 | "zero_optimization": {
20 | "stage": 1
21 | },
22 | "zero_allow_untested_optimizer": true,
23 | "fp16": {
24 | "enabled": true,
25 | "auto_cast": true
26 | },
27 | "gradient_clipping": 0,
28 | "prescale_gradients": false,
29 | "cuda_visible_devices": 0,
30 | "wall_clock_breakdown" : false
31 | }
32 |
--------------------------------------------------------------------------------
/training/imagenet/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 |
--------------------------------------------------------------------------------
/training/imagenet/run_ds.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet
4 |
--------------------------------------------------------------------------------
/training/imagenet/run_ds_fp16.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet
4 |
--------------------------------------------------------------------------------
/training/imagenet/run_ds_fp16_z1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_z1_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet
4 |
--------------------------------------------------------------------------------
/training/megatron/README.md:
--------------------------------------------------------------------------------
1 | # Not maintained / deprecated
2 |
3 | > __Warning__
4 | > All current and future development now happens in the new [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) repository.
5 |
--------------------------------------------------------------------------------
/training/offload_states/output_table.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pytablewriter import MarkdownTableWriter
3 |
4 | # Load the timings logged by run_benchmark.sh.
5 | df = pd.read_csv('offload_states.log')
6 | df.columns = ['pin_memory', 'non_blocking', 'offload_time', 'load_time']
7 |
8 | # Render each measurement as "offload_time / load_time".
9 | df['ratio_string'] = df['offload_time'].round(2).astype(str) + " / " + df['load_time'].round(2).astype(str)
10 |
11 | # One column per (pin_memory, non_blocking) combination, one row per trial.
12 | result_df = pd.DataFrame({
13 |     'pin_memory=0_non_blocking=0': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True),
14 |     'pin_memory=0_non_blocking=1': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True),
15 |     'pin_memory=1_non_blocking=0': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True),
16 |     'pin_memory=1_non_blocking=1': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True)
17 | })
18 | result_df = result_df.dropna()
19 | result_df.index = range(1, len(result_df) + 1)
20 | result_df.index.name = 'trial'
21 |
22 | # Emit the result as a Markdown table.
23 | writer = MarkdownTableWriter()
24 | writer.from_dataframe(result_df, add_index_column=True)
25 | writer.write_table()
--------------------------------------------------------------------------------
/training/offload_states/run_benchmark.sh:
--------------------------------------------------------------------------------
1 | NGPUS=4
2 | HIDDEN_SIZE=32768
3 | NUM_LAYERS=4
4 |
5 | TRIALS=10
6 |
7 | PIN_MEMORY_OPTS=(0 1)
8 | NON_BLOCKING_OPTS=(0 1)
9 |
10 | for i in $(seq 1 $TRIALS); do
11 | for PIN_MEMORY in "${PIN_MEMORY_OPTS[@]}"; do
12 | PIN_MEMORY_ARG=""
13 | if [ $PIN_MEMORY -eq 1 ]; then
14 | PIN_MEMORY_ARG="--pin_memory"
15 | fi
16 |
17 | for NON_BLOCKING in "${NON_BLOCKING_OPTS[@]}"; do
18 | NON_BLOCKING_ARG=""
19 | if [ $NON_BLOCKING -eq 1 ]; then
20 | NON_BLOCKING_ARG="--non_blocking"
21 | fi
22 |
23 | echo "Running iteration $i"
24 | deepspeed --num_gpus=$NGPUS offload_states.py --hidden_dim $HIDDEN_SIZE --nlayers $NUM_LAYERS $PIN_MEMORY_ARG $NON_BLOCKING_ARG
25 | done
26 | done
27 | done
28 | python output_table.py
29 |
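The benchmark exercises DeepSpeed's ZeRO state offloading, which we understand to be the engine-level `offload_states`/`reload_states` API available in recent DeepSpeed releases. A hedged sketch of the timing loop that would produce the offload/load columns in `offload_states.log` (the helper below is illustrative, not the actual `offload_states.py`):

```python
import time
import torch

def benchmark_offload(engine, pin_memory, non_blocking):
    """Time moving ZeRO states to CPU and back on a DeepSpeedEngine."""
    torch.cuda.synchronize()
    start = time.time()
    engine.offload_states(pin_memory=pin_memory, non_blocking=non_blocking)
    torch.cuda.synchronize()
    offload_time = time.time() - start

    start = time.time()
    engine.reload_states(non_blocking=non_blocking)
    torch.cuda.synchronize()
    load_time = time.time() - start
    return offload_time, load_time
```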
--------------------------------------------------------------------------------
/training/pipeline_parallelism/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size" : 256,
3 | "train_micro_batch_size_per_gpu" : 8,
4 |
5 | "optimizer": {
6 | "type": "Adam",
7 | "params": {
8 | "lr": 0.001,
9 | "betas": [
10 | 0.9,
11 | 0.999
12 | ],
13 | "eps": 1e-8
14 | }
15 | },
16 |
17 | "steps_per_print" : 10,
18 | "wall_clock_breakdown" : false
19 | }
20 |
--------------------------------------------------------------------------------
/training/pipeline_parallelism/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed train.py --deepspeed_config=ds_config.json -p 2 --steps=200
4 |
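`train.py` wraps the network in DeepSpeed's pipeline engine; the `-p 2` flag sets the number of pipeline stages. A minimal sketch of the pattern (the layer list and dataset are illustrative), run under the `deepspeed` launcher:

```python
import deepspeed
import torch
import torch.nn as nn
from deepspeed.pipe import PipelineModule
from torch.utils.data import TensorDataset

# Express the network as a flat list of layers so DeepSpeed can
# partition it across pipeline stages.
layers = [nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10)]
model = PipelineModule(layers=layers, num_stages=2,
                       loss_fn=nn.CrossEntropyLoss())

trainset = TensorDataset(torch.randn(1024, 1, 28, 28),
                         torch.randint(0, 10, (1024,)))
engine, _, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(),
    training_data=trainset, config='ds_config.json')

for step in range(200):
    # train_batch pulls micro-batches from the engine's data iterator
    # and runs one full forward/backward/optimizer pipeline schedule.
    loss = engine.train_batch()
```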
--------------------------------------------------------------------------------
/training/stable_diffusion/mytrainbash.sh:
--------------------------------------------------------------------------------
1 | export MODEL_NAME="stabilityai/stable-diffusion-2-1-base"
2 | export OUTPUT_DIR="./sd-distill-v21"
3 |
4 | if [ ! -d "$OUTPUT_DIR" ]; then
5 | mkdir "$OUTPUT_DIR"
6 | echo "Folder '$OUTPUT_DIR' created"
7 | else
8 | echo "Folder '$OUTPUT_DIR' already exists"
9 | fi
10 |
11 |
12 | accelerate launch train_sd_distil_lora.py \
13 | --pretrained_model_name_or_path=$MODEL_NAME \
14 | --output_dir=$OUTPUT_DIR \
15 | --default_prompt="A man dancing" \
16 | --resolution=512 \
17 | --train_batch_size=1 \
18 | --gradient_accumulation_steps=1 \
19 | --learning_rate=5e-6 \
20 | --lr_scheduler="constant" \
21 | --lr_warmup_steps=0
22 |
--------------------------------------------------------------------------------
/training/stable_diffusion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/training/tensor_parallel/README.md:
--------------------------------------------------------------------------------
1 | # Tensor parallel example
2 | This project is adapted from https://github.com/tatsu-lab/stanford_alpaca.
3 | As an example use case, we only modified the ds_config to enable tensor parallelism and add more detailed logging.
4 |
5 | **Script**
6 |
7 | `bash run.sh` or `bash run.sh MODE`
8 |
--------------------------------------------------------------------------------
/training/tensor_parallel/configs/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupDecayLR",
16 | "params": {
17 | "total_num_steps": "auto",
18 | "warmup_min_lr": "auto",
19 | "warmup_max_lr": "auto",
20 | "warmup_num_steps": "auto"
21 | }
22 | },
23 | "zero_optimization": {
24 | "stage": 1,
25 | "gather_16bit_weights_on_model_save": true
26 | },
27 | "tensor_parallel":{
28 | "autotp_size": 4
29 | },
30 | "gradient_accumulation_steps": "auto",
31 | "gradient_clipping": "auto",
32 | "steps_per_print": 1,
33 | "train_batch_size": "auto",
34 | "train_micro_batch_size_per_gpu": "auto",
35 | "wall_clock_breakdown": false
36 | }
--------------------------------------------------------------------------------
/training/tensor_parallel/configs/ds_config_temp.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupDecayLR",
16 | "params": {
17 | "total_num_steps": "auto",
18 | "warmup_min_lr": "auto",
19 | "warmup_max_lr": "auto",
20 | "warmup_num_steps": "auto"
21 | }
22 | },
23 | "zero_optimization": {
24 | "stage": ${zero_stage},
25 | "gather_16bit_weights_on_model_save": true
26 | },
27 | "tensor_parallel":{
28 | "autotp_size": ${autotp_size}
29 | },
30 | "gradient_accumulation_steps": "auto",
31 | "gradient_clipping": "auto",
32 | "steps_per_print": 1,
33 | "train_batch_size": "auto",
34 | "train_micro_batch_size_per_gpu": "auto",
35 | "wall_clock_breakdown": false
36 | }
--------------------------------------------------------------------------------
/training/tensor_parallel/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.50.1
2 | deepspeed>=0.16.4
3 | accelerate==1.6.0
4 | numpy
5 | rouge_score
6 | fire
7 | openai==0.28.0
8 | torch
9 | sentencepiece
10 | tokenizers>=0.13.3
11 |
--------------------------------------------------------------------------------