├── .github └── workflows │ └── formatting.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── applications ├── DeepSpeed-Chat │ ├── .gitignore │ ├── README.md │ ├── assets │ │ ├── image │ │ │ ├── 1.3B-breakdown.png │ │ │ ├── Banner-benchmark.png │ │ │ ├── RLHF.png │ │ │ ├── democrat2.png │ │ │ ├── ds-chat-single.gif │ │ │ ├── ds-chat.gif │ │ │ ├── ds-shiba.png │ │ │ ├── e2e_RLHF.png │ │ │ ├── four_blocks.png │ │ │ ├── ppo_trainer.png │ │ │ ├── reward_function.png │ │ │ └── shiba.png │ │ └── video │ │ │ └── release_v3.mp4 │ ├── chat.py │ ├── dschat │ │ ├── rlhf │ │ │ ├── ppo_trainer.py │ │ │ └── rlhf_engine.py │ │ └── utils │ │ │ ├── data │ │ │ ├── data_utils.py │ │ │ └── raw_datasets.py │ │ │ ├── ds_utils.py │ │ │ ├── model │ │ │ ├── model_utils.py │ │ │ └── reward_model.py │ │ │ ├── module │ │ │ └── lora.py │ │ │ ├── perf.py │ │ │ └── utils.py │ ├── e2e_rlhf.py │ ├── inference │ │ └── chatbot.py │ ├── requirements.txt │ ├── setup.py │ ├── tests │ │ └── test_training.py │ └── training │ │ ├── README.md │ │ ├── step1_supervised_finetuning │ │ ├── README.md │ │ ├── evaluation_scripts │ │ │ └── run_prompt.sh │ │ ├── main.py │ │ ├── prompt_eval.py │ │ ├── training_log_output │ │ │ └── opt-1.3b-globalBatchSize128.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ ├── opt │ │ │ ├── multi_node │ │ │ │ └── run_66b.sh │ │ │ ├── single_gpu │ │ │ │ ├── run_1.3b.sh │ │ │ │ └── run_6.7b_lora.sh │ │ │ └── single_node │ │ │ │ ├── run_1.3b.sh │ │ │ │ ├── run_1.3b_lora.sh │ │ │ │ ├── run_13b.sh │ │ │ │ ├── run_30b_lora.sh │ │ │ │ ├── run_6.7b.sh │ │ │ │ └── sweep │ │ │ │ ├── README.md │ │ │ │ ├── run_single.sh │ │ │ │ └── run_step1_sweep.sh │ │ │ └── other_language │ │ │ ├── run_chinese.sh │ │ │ └── run_japanese.sh │ │ ├── step2_dpo_finetuning │ │ ├── README.md │ │ ├── main.py │ │ ├── training_log_output │ │ │ └── opt-350M_globalBatchSize-32.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ └── opt │ │ │ ├── multi_node │ │ │ └── run_350m.sh │ │ │ ├── single_gpu │ │ │ └── run_350m.sh │ │ │ └── single_node │ │ │ ├── run_350m.sh │ │ │ └── sweep │ │ │ ├── README.md │ │ │ ├── run_single.sh │ │ │ └── run_step2_sweep.sh │ │ ├── step2_reward_model_finetuning │ │ ├── README.md │ │ ├── evaluation_scripts │ │ │ └── run_eval.sh │ │ ├── main.py │ │ ├── rw_eval.py │ │ ├── training_log_output │ │ │ └── opt-350m_globalBatchSize-64.log │ │ └── training_scripts │ │ │ ├── README.md │ │ │ ├── llama2 │ │ │ ├── run_llama2_7b.sh │ │ │ └── run_llama2_7b_lora.sh │ │ │ └── opt │ │ │ ├── multi_node │ │ │ └── run_350m.sh │ │ │ ├── single_gpu │ │ │ └── run_350m.sh │ │ │ └── single_node │ │ │ ├── run_350m.sh │ │ │ └── sweep │ │ │ ├── README.md │ │ │ ├── run_single.sh │ │ │ └── run_step2_sweep.sh │ │ └── step3_rlhf_finetuning │ │ ├── BenckmarkSetting.md │ │ ├── README.md │ │ ├── main.py │ │ ├── training_log_output │ │ └── actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log │ │ └── training_scripts │ │ ├── README.md │ │ ├── llama2 │ │ ├── run_llama2_7b.sh │ │ ├── run_llama2_7b_lora.sh │ │ └── run_llama2_7b_mixz.sh │ │ └── opt │ │ ├── multi_node │ │ └── run_66b.sh │ │ ├── single_gpu │ │ ├── run_1.3b.sh │ │ └── run_6.7b_lora.sh │ │ └── single_node │ │ ├── run_1.3b.sh │ │ ├── run_1.3b_lora.sh │ │ ├── run_13b.sh │ │ ├── run_30b_lora.sh │ │ ├── run_6.7b.sh │ │ └── sweep │ │ ├── README.md │ 
│ ├── run_single.sh │ │ └── run_step3_sweep.sh └── DeepSpeed-VisualChat │ ├── README.md │ ├── assets │ ├── banner.png │ ├── ceos.png │ ├── friends.png │ ├── hero-figure.png │ └── model.png │ ├── chat │ ├── README.md │ ├── chat.py │ └── chat_scripts │ │ └── run.sh │ ├── eval │ ├── README.md │ ├── batch_generation.py │ ├── eval_data │ │ ├── eval_comprehensive.json │ │ ├── eval_robustness.json │ │ ├── eval_single.json │ │ └── images │ │ │ ├── cats │ │ │ ├── 1806905748_adb926a0a0.jpg │ │ │ ├── british_shorthair.jpg │ │ │ └── cat.png │ │ │ ├── friends │ │ │ ├── can-count1.jpg │ │ │ ├── can-count2.jpg │ │ │ ├── wrong-count1.jpg │ │ │ └── wrong-count2.jpg │ │ │ ├── singles │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 202160027_b319c4166e.jpg │ │ │ ├── 50.jpg │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ │ ├── tech-ceo │ │ │ ├── gate1.jpg │ │ │ ├── jobs1.jpg │ │ │ └── musk1.jpg │ │ │ └── zootopia │ │ │ ├── z1.png │ │ │ ├── z2.png │ │ │ ├── z2a.png │ │ │ └── z3.png │ ├── eval_scripts │ │ └── run_batch.sh │ └── results │ │ ├── eval_comprehensive │ │ ├── ours-set1_best_eval.csv │ │ ├── ours-set1_final.csv │ │ ├── ours-set2_best_eval.csv │ │ └── ours-set2_final.csv │ │ ├── eval_robustness │ │ ├── ours-set1_best_eval.csv │ │ ├── ours-set1_final.csv │ │ ├── ours-set2_best_eval.csv │ │ └── ours-set2_final.csv │ │ └── eval_single │ │ ├── ours-single_best_eval.csv │ │ └── ours-single_final.csv │ ├── helper │ ├── README.md │ ├── extract_qwen_vl.py │ └── qwen_clip │ │ ├── config.json │ │ └── preprocessor_config.json │ ├── requirements.txt │ ├── training │ ├── README.md │ ├── main.py │ └── training_scripts │ │ └── run_7b.sh │ └── utils │ ├── data │ ├── DST.py │ ├── __init__.py │ ├── aokvqa_dataset.py │ ├── builder.py │ ├── cc_sbu_align_dataset.py │ ├── coco_caption_dataset.py │ ├── dial_dataset.py │ ├── llava_dataset.py │ ├── llava_otter_blend_dataset.py │ ├── ocr_vqa_dataset.py │ ├── otter_mimicit_cgd_dataset.py │ ├── otter_mimicit_sd_dataset.py │ ├── otter_mimicit_sn_dataset.py │ ├── otter_mimicit_tvc_dataset.py │ ├── otter_mimicit_vst_dataset.py │ ├── sparkles_dialogue_dataset.py │ ├── utils.py │ └── vqa_dataset.py │ ├── ds_utils.py │ ├── model │ ├── __init__.py │ ├── modeling_dsvl.py │ ├── third_party_model │ │ ├── hf_model │ │ │ ├── configuration_llama.py │ │ │ └── modeling_llama.py │ │ └── qwen_clip │ │ │ └── qwen_clip.py │ └── vis_proj.py │ ├── module │ └── lora.py │ └── utils.py ├── benchmarks ├── README.md ├── communication │ ├── README.md │ ├── __init__.py │ ├── all_gather.py │ ├── all_reduce.py │ ├── all_to_all.py │ ├── broadcast.py │ ├── constants.py │ ├── pt2pt.py │ ├── run_all.py │ └── utils.py ├── deepcompile │ ├── .gitignore │ ├── README.md │ ├── configs │ │ ├── ddp_config.yaml.template │ │ ├── ds_config.json.template │ │ ├── ds_config.yaml.template │ │ ├── fsdp_config.yaml.template │ │ └── singlegpu_config.yaml.template │ ├── gen_chart_acc_steps.py │ ├── generate_conf.py │ ├── hostfile_n4 │ ├── plot.py │ ├── plot_common.py │ ├── results │ │ ├── acc_step_1 │ │ │ └── throughput │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs2.png │ │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs4.png │ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs1.png │ │ │ │ ├── chart_throughput_Mixtral-8x7B_np32_bs2.png │ │ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs4.png │ │ └── acc_step_1_16 │ │ │ └── throughput │ │ │ ├── chart_throughput_Llama-3-70B_np32_bs1.png │ │ │ └── chart_throughput_Mixtral-8x7B_np32_bs1.png │ ├── run.sh │ ├── run_bench.sh │ ├── 
run_bench_acc.sh │ ├── run_bench_lm.py │ ├── run_bench_offload.sh │ ├── run_bench_z1.sh │ └── run_multinode.sh └── inference │ ├── README.md │ ├── bert-bench.py │ ├── collect_results.py │ ├── deepspeedometer │ ├── README.md │ ├── configs │ │ ├── 128k-120.yaml │ │ ├── 1300-120.yaml │ │ ├── 2600-60.yaml │ │ └── 500-500.yaml │ ├── pyproject.toml │ ├── run_example.sh │ ├── src │ │ └── deepspeedometer │ │ │ ├── __init__.py │ │ │ ├── arg_parsing.py │ │ │ ├── benchmark_runner.py │ │ │ ├── clients │ │ │ ├── __init__.py │ │ │ ├── azure_ml_client.py │ │ │ ├── base.py │ │ │ ├── dummy_client.py │ │ │ ├── fastgen_client.py │ │ │ ├── openai_client.py │ │ │ └── vllm_client.py │ │ │ ├── config.py │ │ │ ├── prompt.py │ │ │ ├── response.py │ │ │ └── sample_input.py │ └── tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_benchmark.py │ │ ├── test_config.py │ │ ├── test_early_stop.py │ │ └── test_prompt.py │ ├── gpt-bench.py │ ├── mii │ ├── A6000_benchmarks_example.PNG │ ├── README.md │ ├── plot_config.yaml │ ├── requirements.txt │ ├── run_all.sh │ ├── run_aml.sh │ ├── run_benchmark.py │ ├── run_example.sh │ ├── run_fp6.sh │ └── src │ │ ├── __init__.py │ │ ├── client.py │ │ ├── defaults.py │ │ ├── plot_effective_throughput.py │ │ ├── plot_latency_percentile.py │ │ ├── plot_repl_scale.py │ │ ├── plot_th_lat.py │ │ ├── plot_tp_sizes.py │ │ ├── postprocess_results.py │ │ ├── random_query_generator.py │ │ ├── sample_input.py │ │ ├── server.py │ │ └── utils.py │ ├── requirements.txt │ ├── run_model.sh │ ├── run_triton_benchmark.sh │ ├── sweep.sh │ └── triton-bert-benchmark.py ├── compression ├── README.md ├── bert │ ├── README.md │ ├── bash_script │ │ ├── XTC │ │ │ ├── layer_reduction.sh │ │ │ ├── layer_reduction_1bit.sh │ │ │ └── quant_1bit.sh │ │ ├── ZeroQuant │ │ │ ├── zero_quant.sh │ │ │ └── zero_quant_lkd.sh │ │ ├── layer_reduction.sh │ │ ├── pruning_head.sh │ │ ├── pruning_row.sh │ │ ├── pruning_sparse.sh │ │ ├── pruning_sparse_snip_momentum.sh │ │ ├── quant_activation.sh │ │ └── quant_weight.sh │ ├── config │ │ ├── XTC │ │ │ ├── ds_config_W1A8_Qgroup1_fp32.json │ │ │ ├── ds_config_layer_reduction_W1Q8_fp32.json │ │ │ └── ds_config_layer_reduction_fp16.json │ │ ├── ZeroQuant │ │ │ ├── ds_config_W48A8_Qgroup48_lkd_fp32.json │ │ │ └── ds_config_W8A8_Qgroup48_fp32.json │ │ ├── ds_config.json │ │ ├── ds_config_TEMPLATE.json │ │ ├── ds_config_W1A8_Qgroup64_fp16.json │ │ ├── ds_config_W1A8_Qgroup64_fp32.json │ │ ├── ds_config_W1or2A8_Qgroup64_fp16.json │ │ └── ds_config_structural_pruning_TEMPLATE.json │ ├── huggingface_transformer │ │ └── modeling_bert.py │ ├── requirements.txt │ ├── run_glue_lkd.py │ ├── run_glue_no_trainer.py │ └── util.py ├── cifar │ ├── README.md │ ├── config │ │ ├── ds_config.json │ │ └── ds_config_channel_prune.json │ ├── resnet.py │ ├── run_compress.sh │ ├── train.py │ └── utils.py └── gpt2 │ ├── README.md │ ├── bash_script │ └── run_zero_quant.sh │ ├── config │ ├── ds_config.json │ ├── ds_config_W4or8A8_Qgroup64_fp16.json │ ├── ds_config_W4or8A8_Qgroup64_fp32.json │ ├── ds_config_W8A8_Qgroup64_fp16.json │ └── ds_config_W8A8_Qgroup64_fp32.json │ ├── requirements.txt │ └── run_clm_no_trainer.py ├── deepnvme ├── file_access │ ├── README.md │ ├── aio_load_cpu_tensor.py │ ├── aio_load_gpu_tensor.py │ ├── aio_store_cpu_tensor.py │ ├── aio_store_gpu_tensor.py │ ├── gds_load_gpu_tensor.py │ ├── gds_store_gpu_tensor.py │ ├── media │ │ └── deepnvme_ops_report.png │ ├── py_load_cpu_tensor.py │ ├── py_load_gpu_tensor.py │ ├── py_store_cpu_tensor.py │ ├── 
py_store_gpu_tensor.py │ ├── run_load_tensor.sh │ ├── run_store_tensor.sh │ └── utils.py ├── model_checkpoint │ ├── README.md │ ├── deepspeed_save_model.py │ ├── requirements.txt │ ├── save_model_utils.py │ ├── torch │ │ ├── serialization_fast_v2.6.0.py │ │ └── serialization_orig_v2.6.0.py │ ├── torch_save_model.py │ ├── torch_save_tensor.py │ └── torch_save_utils.py └── zero_inference │ ├── README.md │ └── media │ ├── nvme_config.png │ ├── zero_inf_mem_use_cpu.png │ └── zero_inf_mem_use_gds.png ├── evaluation └── inference │ └── human_eval │ ├── README.md │ └── run_human_eval.py ├── inference ├── huggingface │ ├── README.md │ ├── automatic-speech-recognition │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-wav2vec2.py │ ├── fill-mask │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── test-bert.py │ │ ├── test-electra.py │ │ └── test-roberta.py │ ├── stable-diffusion │ │ ├── README.md │ │ ├── local_pipeline_stable_diffusion.py │ │ ├── requirements.txt │ │ └── test-stable-diffusion.py │ ├── text-generation │ │ ├── README.md │ │ ├── arguments.py │ │ ├── ds-hf-compare.py │ │ ├── inference-test.py │ │ ├── requirements.txt │ │ ├── run-generation-script │ │ │ ├── README.md │ │ │ ├── requirements.txt │ │ │ ├── sample_query.txt │ │ │ ├── single_query.txt │ │ │ ├── test-gpt.sh │ │ │ └── test-run-generation.py │ │ └── utils.py │ ├── text2text-generation │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-t5.py │ ├── translation │ │ ├── README.md │ │ ├── requirements.txt │ │ └── test-t5-base.py │ └── zero_inference │ │ ├── README.md │ │ ├── images │ │ └── over_v1.png │ │ ├── model-support.md │ │ ├── requirements.txt │ │ ├── run_bloom175b_a6000.sh │ │ ├── run_llama2_70b_a6000.sh │ │ ├── run_model.py │ │ ├── run_model.sh │ │ ├── run_opt175b_a6000.sh │ │ ├── run_opt1p3b_a6000.sh │ │ ├── run_opt30b_a6000.sh │ │ ├── run_opt66b_a6000.sh │ │ ├── timer.py │ │ └── utils.py ├── mii │ ├── README.md │ ├── non-persistent │ │ ├── README.md │ │ ├── falcon.py │ │ ├── llama2.py │ │ ├── mixtral.py │ │ └── pipeline.py │ ├── persistent │ │ ├── README.md │ │ ├── client.py │ │ ├── serve.py │ │ └── terminate.py │ └── requirements.txt └── sglang │ ├── README.md │ ├── ds_offload_cpu.json │ ├── ds_offload_nvme_aio.json │ ├── ds_offload_nvme_gds.json │ ├── run_llama3_1B.sh │ ├── run_llama3_70B.sh │ └── run_llama3_8B.sh ├── scripts └── check-license.py └── training ├── BingBertGlue ├── glue_bert_base.json ├── glue_bert_large.json ├── nvidia │ ├── modeling.py │ ├── modelingpreln.py │ └── modelingpreln_layerdrop.py ├── nvidia_bert_dataset_provider.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── run_glue_bert_base_finetune.sh ├── run_glue_bert_large_finetune.sh ├── run_glue_classifier_bert_base.py ├── run_glue_classifier_bert_large.py └── turing │ ├── dataset.py │ ├── file_utils.py │ ├── logger.py │ ├── loss.py │ ├── models.py │ ├── sources.py │ ├── text.py │ └── utils.py ├── BingBertSquad ├── 1-bit_adam │ ├── mpi_ethernet │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ ├── run_squad_deepspeed_onebitadam.sh │ │ └── run_squad_mpi_onebitadam.sh │ ├── mpi_infiniband │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ ├── run_squad_deepspeed_onebitadam.sh │ │ └── run_squad_mpi_onebitadam.sh │ └── nccl │ │ ├── deepspeed_onebitadam_bsz96_config.json │ │ └── run_squad_deepspeed_onebitadam.sh ├── NOTICE.txt ├── ckpt │ └── bert-large-uncased-whole-word-masking-config.json ├── 
convert_bert_ckpt_to_deepspeed.py ├── deepspeed_bsz24_config.json ├── evaluate-v1.1.py ├── evaluate.py ├── nvidia_run_squad_baseline.py ├── nvidia_run_squad_deepspeed.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── run_hf.sh ├── run_squad_baseline.sh ├── run_squad_deepspeed.sh ├── turing │ ├── file_utils.py │ ├── loss.py │ ├── modelingpreln_layerdrop.py │ ├── nvidia_modeling.py │ └── nvidia_modelingpreln.py └── utils.py ├── DeepSpeed-Domino ├── README.md ├── domino │ ├── gpt_model.py │ ├── language_model.py │ └── training.py ├── pretrain_gpt.py ├── pretrain_gpt3_13b.sh ├── pretrain_gpt3_6.7b.sh └── requirements.txt ├── HelloDeepSpeed ├── README.md ├── requirements.txt ├── run.sh ├── run_ds.sh ├── tests │ ├── __init__.py │ └── test_train_bert.py ├── train_bert.py └── train_bert_ds.py ├── MoQ ├── README.md ├── huggingface-transformers │ └── examples │ │ └── research_projects │ │ └── lxmert │ │ └── requirements.txt ├── requirements.txt ├── run.sh ├── run_glue.py └── test.json ├── autotuning ├── .gitignore ├── README.md └── hf │ ├── README.md │ ├── bert-base │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── bert-large │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── deberta │ ├── README.md │ ├── ds_config_fp16_tune.json │ └── test_tune.sh │ ├── distilbert │ ├── README.md │ ├── ds_config_tune.json │ └── test_tune.sh │ ├── dsconfigs │ ├── ds_config_fp16_tune.json │ ├── ds_config_fp16_z0.json │ ├── ds_config_fp16_z1.json │ ├── ds_config_fp16_z2.json │ ├── ds_config_fp16_z3.json │ ├── ds_config_tune.json │ ├── ds_config_z0.json │ ├── ds_config_z1.json │ ├── ds_config_z2.json │ └── ds_config_z3.json │ ├── gpt2-large │ ├── README.md │ └── test_tune.sh │ ├── gpt2-medium │ ├── README.md │ └── test_tune.sh │ ├── gpt2-xl │ ├── README.md │ └── test_tune.sh │ └── gpt2 │ ├── README.md │ └── test_tune.sh ├── bing_bert ├── 01_adam │ ├── mpi_ethernet │ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_mpi_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz4k_01adam_config_seq128_nccl.json │ │ ├── deepspeed_bsz4k_01adam_config_seq512_nccl.json │ │ ├── ds_train_bert_01adam_bsz4k_seq128_nccl.sh │ │ └── ds_train_bert_01adam_bsz4k_seq512_nccl.sh ├── 1-bit_adam │ ├── mpi_ethernet │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json │ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_ethernet.sh │ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json │ │ ├── ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh │ │ └── mpi_train_bert_onebitadam_bsz4k_seq128_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz4k_onebitadam_config_seq128_nccl.json │ │ └── ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh ├── 1-bit_lamb │ ├── mpi_ethernet │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_ethernet.sh │ │ ├── 
ds_train_bert_onebitlamb_bsz64k_seq128_mpi_ethernet.sh │ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_ethernet.sh │ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_ethernet.sh │ ├── mpi_infiniband │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_mpi_infiniband.sh │ │ ├── ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh │ │ ├── mpi_train_bert_onebitlamb_bsz32k_seq512_infiniband.sh │ │ └── mpi_train_bert_onebitlamb_bsz64k_seq128_infiniband.sh │ └── nccl │ │ ├── deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json │ │ ├── deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json │ │ ├── ds_train_bert_onebitlamb_bsz32k_seq512_nccl.sh │ │ └── ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh ├── NOTICE.txt ├── README.md ├── bert_base.json ├── bert_base_large_lr.json ├── bert_dataset_provider.py ├── bert_large.json ├── bert_large_lamb.json ├── bert_large_lamb_nvidia_data.json ├── bing_bert_dataset_provider.py ├── data_worker.py ├── deepspeed_bsz32k_lamb_config_seq512.json ├── deepspeed_bsz4k_progressive_layer_drop_config_seq128.json ├── deepspeed_bsz64k_lamb_config_seq128.json ├── deepspeed_train.py ├── ds_sa_train_bert_bsz64k_seq128.sh ├── ds_train_bert_bsz32k_seq512.sh ├── ds_train_bert_bsz64k_seq128.sh ├── ds_train_bert_nvidia_data_bsz32k_seq512.sh ├── ds_train_bert_nvidia_data_bsz64k_seq128.sh ├── ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh ├── glue_bert_base.json ├── glue_bert_large.json ├── nvidia │ ├── modelingpreln.py │ └── modelingpreln_layerdrop.py ├── nvidia_bert_dataset_provider.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ └── tokenization.py ├── requirements.txt ├── run_glue_bert_base_finetune.sh ├── run_glue_bert_large_finetune.sh ├── run_glue_classifier_bert_base.py ├── run_glue_classifier_bert_large.py ├── timer.py ├── turing │ ├── dataset.py │ ├── file_utils.py │ ├── logger.py │ ├── loss.py │ ├── models.py │ ├── sources.py │ ├── text.py │ └── utils.py └── utils.py ├── cifar ├── LICENSE ├── NOTICE.txt ├── README.md ├── cifar10_deepspeed.py ├── cifar10_tutorial.py ├── requirements.txt ├── run_ds.sh ├── run_ds_moe.sh └── run_ds_prmoe.sh ├── data_efficiency ├── gpt_finetuning │ ├── README.md │ ├── analyze_data.py │ ├── bash_script │ │ ├── run_base_random_ltd.sh │ │ └── run_medium_random_ltd.sh │ ├── config │ │ ├── ds_config_gpt_base_random_ltd.json │ │ └── ds_config_gpt_medium_random_ltd.json │ ├── finetune │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── ds_config_gpt2-medium_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt2-medium_2clmetrics_TEMPLATE.json │ │ ├── ds_config_gpt2_TEMPLATE.json │ │ ├── ds_finetune_gpt2.sh │ │ └── ds_finetune_gpt2_run.sh │ ├── learning_rates.py │ ├── requirement.txt │ └── run_clm_no_trainer.py ├── variable_batch_size_and_lr │ ├── README.md │ ├── variable_attn_matrix.png │ ├── variable_batch_lr.png │ ├── variable_batch_lr_pipeline.png │ └── variable_batch_size_and_lr_example.py └── vit_finetuning │ ├── README.md │ ├── bash_script │ ├── run_cifar_random_ltd.sh │ └── run_imagenet_random_ltd.sh │ ├── config │ ├── ds_config_cifar_random_ltd.json │ └── ds_config_imagenet_random_ltd.json │ ├── main_cifar.py │ ├── main_imagenet.py │ ├── models │ ├── __init__.py │ └── vit.py │ ├── requirement.txt │ └── utils │ ├── __init__.py │ ├── get_data.py │ └── utils.py ├── gan 
├── gan_baseline_run.sh ├── gan_baseline_train.py ├── gan_deepspeed_config.json ├── gan_deepspeed_run.sh ├── gan_deepspeed_train.py ├── gan_model.py └── utils.py ├── imagenet ├── README.md ├── assets │ └── resnetplot.png ├── config │ ├── ds_config.json │ ├── ds_fp16_config.json │ └── ds_fp16_z1_config.json ├── extract_ILSVRC.sh ├── main.py ├── requirements.txt ├── run_ds.sh ├── run_ds_fp16.sh └── run_ds_fp16_z1.sh ├── megatron └── README.md ├── offload_states ├── README.md ├── offload_states.py ├── output_table.py └── run_benchmark.sh ├── pipeline_parallelism ├── alexnet.py ├── ds_config.json ├── run.sh └── train.py ├── stable_diffusion ├── README.md ├── inf_txt2img_loop.py ├── local_pipeline_stable_diffusion.py ├── mytrainbash.sh ├── requirements.txt └── train_sd_distil_lora.py └── tensor_parallel ├── README.md ├── alpaca_data.json ├── configs ├── ds_config.json └── ds_config_temp.json ├── requirements.txt ├── run.sh ├── train.py ├── train_bench_length.py └── utils.py /.github/workflows/formatting.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'staging**' 7 | pull_request: 8 | branches: 9 | '**' 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | 17 | # formatting and basic install on cpu-only machine 18 | formatting: 19 | runs-on: ubuntu-22.04 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: environment 25 | run: | 26 | which python 27 | python --version 28 | pip install pre-commit>=2.20.0 29 | 30 | - name: Formatting checks 31 | run: | 32 | pre-commit run --all-files 33 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "training/DeepSpeed-Domino/Megatron-LM"] 2 | path = training/DeepSpeed-Domino/Megatron-LM 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v1.2.3 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: "Megatron-LM/" 7 | files: ^applications/DeepSpeed-Chat/.+ 8 | - id: check-yaml 9 | exclude: "Megatron-LM/" 10 | files: ^applications/DeepSpeed-Chat/.+ 11 | - id: end-of-file-fixer 12 | exclude: "Megatron-LM/" 13 | files: ^applications/DeepSpeed-Chat/.+ 14 | 15 | 16 | - repo: https://github.com/google/yapf 17 | rev: v0.32.0 18 | hooks: 19 | - id: yapf 20 | files: ^applications/DeepSpeed-Chat/.+ 21 | 22 | - repo: https://github.com/pycqa/flake8 23 | rev: 4.0.1 24 | hooks: 25 | - id: flake8 26 | args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401'] 27 | files: ^applications/DeepSpeed-Chat/.+ 28 | 29 | - repo: local 30 | hooks: 31 | - id: check-license 32 | name: check-license 33 | entry: ./scripts/check-license.py 34 | language: script 35 | files: ^applications/DeepSpeed-Chat/.+\.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr|sh)$ 36 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tjruwase @ShadenSmith @awan-10 @minjiaz 2 | -------------------------------------------------------------------------------- 
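The `formatting.yml` workflow and `.pre-commit-config.yaml` above make up the repository's formatting gate. A minimal sketch of reproducing the same checks locally, assuming a working Python environment:

```bash
# Install pre-commit; the quotes keep the shell from treating ">=" as a redirect.
pip install "pre-commit>=2.20.0"

# Run every configured hook (trailing-whitespace, yapf, flake8, check-license, ...)
# against all files, exactly as the CI formatting job does.
pre-commit run --all-files
```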
/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/1.3B-breakdown.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/Banner-benchmark.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/RLHF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/RLHF.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/democrat2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/democrat2.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat-single.gif -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-chat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-chat.gif -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ds-shiba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ds-shiba.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/e2e_RLHF.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/four_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/four_blocks.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/ppo_trainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/ppo_trainer.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/reward_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/reward_function.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/image/shiba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/image/shiba.png -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/assets/video/release_v3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-Chat/assets/video/release_v3.mp4 -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/chat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | import subprocess 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--path", 12 | type=str, 13 | help="Directory containing trained actor model") 14 | parser.add_argument( 15 | "--max_new_tokens", 16 | type=int, 17 | default=128, 18 | help="Maximum new tokens to generate per response", 19 | ) 20 | args = parser.parse_args() 21 | 22 | cmd = f"python3 ./inference/chatbot.py --path {args.path} --max_new_tokens {args.max_new_tokens}" 23 | p = subprocess.Popen(cmd, shell=True) 24 | p.wait() 25 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | deepspeed>=0.9.0 7 | transformers>=4.31.0,!=4.33.2 8 | tensorboard 9 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # setup.py: install script for deepspeed_chat 8 | """ 9 | to install deepspeed_chat and its dependencies for development work, 10 | run this cmd from the root directory: 11 | pip install -e . 12 | """ 13 | import setuptools 14 | 15 | setuptools.setup( 16 | name="deepspeed-chat", 17 | version="0.1", 18 | url= 19 | "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", 20 | include_package_data=True, 21 | packages=setuptools.find_packages(include=['dschat']), 22 | install_requires=[ 23 | "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", 24 | "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", 25 | "transformers>=4.31.0,!=4.33.2", "tensorboard" 26 | ], 27 | extras_require={ 28 | "azureml": [ 29 | "azure-ml-component", 30 | "azureml-core", 31 | ], 32 | }) 33 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # You can provide two models to compare the performance of the baseline and the finetuned model 8 | export CUDA_VISIBLE_DEVICES=0 9 | python prompt_eval.py \ 10 | --model_name_or_path_baseline XXX \ 11 | --model_name_or_path_finetune XXX 12 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of "facebook/opt" family. 2 | 3 | If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace 4 | `` --model_name_or_path facebook/opt-1.3b`` to ``--model_name_or_path EleutherAI/gpt-j-6b ``. 
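As a purely illustrative one-liner, the swap could also be scripted; the `sed` pattern below assumes the flag appears exactly as written in `training_scripts/opt/single_node/run_1.3b.sh`:

```bash
# Hypothetical in-place edit; inspect the script afterwards before running it.
sed -i 's|--model_name_or_path facebook/opt-1.3b|--model_name_or_path EleutherAI/gpt-j-6b|' \
    training_scripts/opt/single_node/run_1.3b.sh
```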
5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step1_llama2_7b 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 4 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step1_llama2_7b_lora 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 4 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --lora_dim 128 \ 34 | --lora_module_name "layers." \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/multi_node/run_66b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-66b \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 1e-4 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 2 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --lora_dim 128 \ 33 | --lora_module_name decoder.layers. \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT=$1 9 | ZERO_STAGE=$2 10 | if [ "$OUTPUT" == "" ]; then 11 | OUTPUT=./output 12 | fi 13 | if [ "$ZERO_STAGE" == "" ]; then 14 | ZERO_STAGE=0 15 | fi 16 | mkdir -p $OUTPUT 17 | 18 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-1.3b \ 19 | --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \ 20 | --enable_tensorboard \ 21 | --tensorboard_path $OUTPUT \ 22 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 23 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT_PATH=./output 9 | mkdir -p $OUTPUT_PATH 10 | 11 | deepspeed --num_gpus 1 main.py \ 12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 13 | --data_split 2,4,4 \ 14 | --model_name_or_path facebook/opt-6.7b \ 15 | --per_device_train_batch_size 8 \ 16 | --per_device_eval_batch_size 8 \ 17 | --max_seq_len 512 \ 18 | --learning_rate 1e-3 \ 19 | --weight_decay 0. \ 20 | --num_train_epochs 16 \ 21 | --gradient_accumulation_steps 16 \ 22 | --lr_scheduler_type cosine \ 23 | --num_warmup_steps 0 \ 24 | --seed 1234 \ 25 | --gradient_checkpointing \ 26 | --zero_stage 0 \ 27 | --lora_dim 128 \ 28 | --lora_module_name decoder.layers. \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT_PATH \ 31 | &> $OUTPUT_PATH/training.log 32 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-1.3b \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --zero_stage $ZERO_STAGE \ 31 | --deepspeed \ 32 | --enable_tensorboard \ 33 | --tensorboard_path $OUTPUT \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT_PATH=./output 9 | mkdir -p $OUTPUT_PATH 10 | 11 | deepspeed main.py \ 12 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 13 | --data_split 2,4,4 \ 14 | --model_name_or_path facebook/opt-1.3b \ 15 | --per_device_train_batch_size 8 \ 16 | --per_device_eval_batch_size 8 \ 17 | --max_seq_len 512 \ 18 | --learning_rate 1e-3 \ 19 | --weight_decay 0.1 \ 20 | --num_train_epochs 16 \ 21 | --gradient_accumulation_steps 1 \ 22 | --lr_scheduler_type cosine \ 23 | --num_warmup_steps 0 \ 24 | --seed 1234 \ 25 | --zero_stage 0 \ 26 | --lora_dim 128 \ 27 | --lora_module_name decoder.layers. \ 28 | --only_optimize_lora \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT_PATH \ 31 | &> $OUTPUT_PATH/training.log 32 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_13b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-13b \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 1e-4 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --lora_dim 128 \ 33 | --lora_module_name decoder.layers. 
\ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_30b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT_PATH=./output 7 | mkdir -p $OUTPUT_PATH 8 | 9 | deepspeed main.py \ 10 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 11 | --data_split 2,4,4 \ 12 | --model_name_or_path facebook/opt-30b \ 13 | --per_device_train_batch_size 4 \ 14 | --per_device_eval_batch_size 4 \ 15 | --max_seq_len 512 \ 16 | --learning_rate 9.65e-6 \ 17 | --weight_decay 0. \ 18 | --num_train_epochs 16 \ 19 | --gradient_accumulation_steps 1 \ 20 | --lr_scheduler_type cosine \ 21 | --num_warmup_steps 0 \ 22 | --seed 1234 \ 23 | --lora_dim 128 \ 24 | --gradient_checkpointing \ 25 | --zero_stage 3 \ 26 | --deepspeed \ 27 | --output_dir $OUTPUT_PATH \ 28 | &> $OUTPUT_PATH/training.log 29 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_6.7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-6.7b \ 20 | --per_device_train_batch_size 6 \ 21 | --per_device_eval_batch_size 6 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0. \ 25 | --num_train_epochs 16 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 1 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3
11 | Offload: True, False
12 | LoRA: True, False
13 | 
14 | 15 | The `run_step1_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay); a sketch of one such extension appears at the end of this README. 16 | 17 | # Usage 18 | The sweep script can be run as follows: 19 |
20 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/opt/single_node/sweep/run_step1_sweep.sh
21 | 
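The loop below is a minimal sketch of one such extension, adding a hypothetical learning-rate axis to the existing sweep; the extra `${lr}` positional argument is an assumption and would need a matching change in `run_single.sh`:

```bash
# Sketch only: the zero-stage/offload/lora sweep from run_step1_sweep.sh,
# extended with an assumed learning-rate dimension. run_single.sh would have
# to accept and forward the extra ${lr} argument for this to work.
for z in {2..3}; do
    for offload in true false; do
        for lora in true false; do
            for lr in 9.65e-6 1e-3; do
                bash training_scripts/opt/single_node/sweep/run_single.sh \
                    ${z} ${offload} ${lora} ${lr} \
                    z${z}_offload_${offload}_lora_${lora}_lr_${lr}
            done
        done
    done
done
```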
22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | for lora in true false 11 | do 12 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 13 | ${z} \ 14 | ${offload} \ 15 | ${lora} \ 16 | z${z}_offload_${offload}_lora_${lora}" 17 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 18 | echo $cmd 19 | $cmd 20 | pkill -9 python 21 | sleep 60 22 | echo "" 23 | done 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | # The Chinese data we found mostly only contain one response without another 17 | # "rejected" response. Thus we only test the step 1 finetuning and use 18 | # a data_split of 10,0,0 (keep all data for step 1). 19 | deepspeed main.py \ 20 | --data_path wangrui6/Zhihu-KOL Cohere/miracl-zh-queries-22-12 Hello-SimpleAI/HC3-Chinese mkqa-Chinese \ 21 | --data_split 10,0,0 \ 22 | --model_name_or_path bigscience/bloom-1b1 \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 8 \ 25 | --max_seq_len 512 \ 26 | --learning_rate 9.65e-6 \ 27 | --weight_decay 0. \ 28 | --num_train_epochs 16 \ 29 | --gradient_accumulation_steps 1 \ 30 | --lr_scheduler_type cosine \ 31 | --num_warmup_steps 0 \ 32 | --seed 1234 \ 33 | --zero_stage $ZERO_STAGE \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/other_language/run_japanese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=2 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | # The Japanese data we found mostly only contain one response without another 17 | # "rejected" response. Thus we only test the step 1 finetuning and use 18 | # a data_split of 10,0,0 (keep all data for step 1). 19 | deepspeed main.py \ 20 | --data_path mkqa-Japanese Cohere/miracl-ja-queries-22-12 lmqg/qg_jaquad lmqg/qag_jaquad \ 21 | --data_split 10,0,0 \ 22 | --model_name_or_path sberbank-ai/mGPT \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 8 \ 25 | --max_seq_len 512 \ 26 | --learning_rate 9.65e-6 \ 27 | --weight_decay 0. 
\ 28 | --num_train_epochs 16 \ 29 | --gradient_accumulation_steps 1 \ 30 | --lr_scheduler_type cosine \ 31 | --num_warmup_steps 0 \ 32 | --seed 1234 \ 33 | --zero_stage $ZERO_STAGE \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of "facebook/opt" family. 2 | 3 | If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace 4 | `` --model_name_or_path facebook/opt-350m`` to ``--model_name_or_path EleutherAI/gpt-neo-125m ``. 5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --offload \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --gradient_accumulation_steps 1 \ 27 | --lr_scheduler_type cosine \ 28 | --num_warmup_steps 0 \ 29 | --seed 1234 \ 30 | --gradient_checkpointing \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --offload \ 34 | --lora_dim 128 \ 35 | --lora_module_name "layers." 
\ 36 | --output_dir $OUTPUT \ 37 | &> $OUTPUT/training.log 38 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --per_device_train_batch_size 2 \ 21 | --per_device_eval_batch_size 2 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 5e-5 \ 24 | --weight_decay 0.1 \ 25 | --dropout 0.0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ 17 | --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ 18 | --enable_tensorboard \ 19 | --tensorboard_path $OUTPUT \ 20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 5e-5 \ 24 | --weight_decay 0.1 \ 25 | --num_train_epochs 1 \ 26 | --dropout 0.0 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --zero_stage $ZERO_STAGE \ 32 | --deepspeed \ 33 | --output_dir $OUTPUT \ 34 | &> $OUTPUT/training.log 35 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Offload: True, False
12 | 
13 | 14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay); see the sketch after the usage example below. 15 | 16 | # Usage 17 | The sweep script can be run as follows: 18 | <pre>
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh<br>
20 | 
21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 11 | ${z} \ 12 | ${offload} \ 13 | z${z}_offload_${offload}" 14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 15 | echo $cmd 16 | $cmd 17 | pkill -9 python 18 | sleep 60 19 | echo "" 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/evaluation_scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # Add the path to the finetuned model 8 | python rw_eval.py \ 9 | --model_name_or_path 10 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family. 2 | 3 | If you want to change the model, e.g. to EleutherAI/gpt-neo-125m, simply replace 4 | ``--model_name_or_path facebook/opt-350m`` with ``--model_name_or_path EleutherAI/gpt-neo-125m``. 5 | 6 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 7 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_padding_at_beginning 0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --gradient_checkpointing \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --offload \ 35 | --output_dir $OUTPUT \ 36 | &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=3 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 20 | --per_device_train_batch_size 8 \ 21 | --per_device_eval_batch_size 8 \ 22 | --max_seq_len 512 \ 23 | --learning_rate 9.65e-6 \ 24 | --weight_decay 0.1 \ 25 | --num_padding_at_beginning 0 \ 26 | --num_train_epochs 1 \ 27 | --gradient_accumulation_steps 1 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 0 \ 30 | --seed 1234 \ 31 | --gradient_checkpointing \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --offload \ 35 | --lora_dim 128 \ 36 | --lora_module_name "layers." \ 37 | --output_dir $OUTPUT \ 38 | &> $OUTPUT/training.log 39 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --num_padding_at_beginning 1 \ 21 | --per_device_train_batch_size 2 \ 22 | --per_device_eval_batch_size 2 \ 23 | --max_seq_len 512 \ 24 | --learning_rate 5e-5 \ 25 | --weight_decay 0.1 \ 26 | --dropout 0.0 \ 27 | --num_train_epochs 1 \ 28 | --gradient_accumulation_steps 1 \ 29 | --lr_scheduler_type cosine \ 30 | --num_warmup_steps 0 \ 31 | --seed 1234 \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ 17 | --num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ 18 | --enable_tensorboard \ 19 | --tensorboard_path $OUTPUT \ 20 | --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log 21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | if [ "$OUTPUT" == "" ]; then 9 | OUTPUT=./output 10 | fi 11 | if [ "$ZERO_STAGE" == "" ]; then 12 | ZERO_STAGE=0 13 | fi 14 | mkdir -p $OUTPUT 15 | 16 | deepspeed main.py \ 17 | --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ 18 | --data_split 2,4,4 \ 19 | --model_name_or_path facebook/opt-350m \ 20 | --num_padding_at_beginning 1 \ 21 | --per_device_train_batch_size 4 \ 22 | --per_device_eval_batch_size 4 \ 23 | --max_seq_len 512 \ 24 | --learning_rate 5e-5 \ 25 | --weight_decay 0.1 \ 26 | --num_train_epochs 1 \ 27 | --dropout 0.0 \ 28 | --gradient_accumulation_steps 1 \ 29 | --lr_scheduler_type cosine \ 30 | --num_warmup_steps 0 \ 31 | --seed 1234 \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --output_dir $OUTPUT \ 35 | &> $OUTPUT/training.log 36 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Offload: True, False
12 | 
13 | 14 | The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay). 15 | 16 | # Usage 17 | The sweep script can be run as follows: 18 | <pre>
19 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
20 | 
21 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | for z in {2..3} 7 | do 8 | for offload in true false 9 | do 10 | cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ 11 | ${z} \ 12 | ${offload} \ 13 | z${z}_offload_${offload}" 14 | echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" 15 | echo $cmd 16 | $cmd 17 | pkill -9 python 18 | sleep 60 19 | echo "" 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/README.md: -------------------------------------------------------------------------------- 1 | ### 💁For each folder, the bash scripts are examples of the "facebook/opt" family. 2 | 3 | If you want to change the models, simply update 4 | ``` --actor_model_name_or_path ${step1_path} --critic_model_name_or_path ${step2_path} ``` to point at your own step 1 and step 2 checkpoints. 5 | 6 | If you don't have step 1 and step 2 models, you may simply try 7 | ``` bash 8 | --actor_model_name_or_path facebook/opt-1.3b --critic_model_name_or_path facebook/opt-350m 9 | ``` 10 | ⚡⚡⚡ When you use the above script, please make sure to set the `rlhf_training` parameter to `False` in both calls to the `create_critic_model` function in [rlhf_engine.py](./../../step3_rlhf_finetuning/rlhf_engine.py), so that it does not load model weights from the previous paths. 11 | 12 | For the models we support, please see [our landing page](./../../../README.md#-supported-models-) 13 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | ACTOR_MODEL_PATH=$1 7 | CRITIC_MODEL_PATH=$2 8 | ACTOR_ZERO_STAGE=$3 9 | CRITIC_ZERO_STAGE=$4 10 | OUTPUT=$5 11 | if [ "$OUTPUT" == "" ]; then 12 | OUTPUT=./output 13 | fi 14 | if [ "$ACTOR_ZERO_STAGE" == "" ]; then 15 | ACTOR_ZERO_STAGE=0 16 | fi 17 | if [ "$CRITIC_ZERO_STAGE" == "" ]; then 18 | CRITIC_ZERO_STAGE=0 19 | fi 20 | mkdir -p $OUTPUT 21 | 22 | deepspeed --num_gpus 1 main.py \ 23 | --actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \ 24 | --actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \ 25 | --num_padding_at_beginning 1 --gradient_accumulation_steps 2 \ 26 | --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \ 27 | --output_dir $OUTPUT &> $OUTPUT/training.log 28 | -------------------------------------------------------------------------------- /applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Characterization Script 2 | 3 | # Contents 4 | * [Introduction](#introduction) 5 | * [Usage](#usage) 6 | 7 | # Introduction 8 | The step 3 characterization script sweeps across various training parameters. Currently, the following parameters are swept: 9 |
10 | ZeRO Stage: 2, 3<br>
11 | Hybrid Engine: True, False
12 | Offload: True, False
13 | LoRA: True, False<br>
14 | 
15 | 16 | The `run_step3_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g., learning rate or weight decay); see the sketch after the usage example below. 17 | 18 | # Usage 19 | The sweep script can be run as follows: 20 | <pre>
21 | DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
22 | 
23 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/banner.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/ceos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/ceos.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/friends.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/friends.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/hero-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/hero-figure.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/assets/model.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/chat/README.md: -------------------------------------------------------------------------------- 1 | We provide a CLI interface for users to test their trained chat model. Please note that you need to provide both the trained checkpoint and the paths to the original language model and vision encoder: the model is first initialized and then the trained checkpoint is loaded. Also note that if you used multi-modal causal attention during training, remember to pass --enable_mmca_attention in your chat script. -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | MAIN_PATH=$1 7 | 8 | VISION_ENCODER=/blob/transformers_cache/qwen-clip 9 | LLM=/blob/transformers_cache/Llama-2-13b-hf 10 | 11 | export CUDA_VISIBLE_DEVICES=0 # Do single GPU evaluation 12 | # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # Do multi gpu evaluation for large models (single GPU is not enough) 13 | 14 | 15 | python chat.py \ 16 | --lm_model_name_or_path $LLM \ 17 | --vision_model_name_or_path $VISION_ENCODER \ 18 | --checkpoint_path $MAIN_PATH --enable_mmca_attention 19 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json: -------------------------------------------------------------------------------- 1 | { 2 | "cat_images1": [["please describe the image", "./eval/eval_data/images/cats/cat.png"]], 3 | "cat_images2": [["can you describe the image", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"]], 4 | "cat_images3": [["please describe the image", "./eval/eval_data/images/cats/british_shorthair.jpg"]], 5 | "extreme_ironing": [["What is unusual about this image?", "./eval/eval_data/images/singles/extreme_ironing.jpg"]], 6 | "waterview": [["What are the things I should be cautious about when I visit here?", "./eval/eval_data/images/singles/waterview.jpg"]], 7 | "art-dog": [["can you describe the image", "./eval/eval_data/images/singles/202160027_b319c4166e.jpg"]], 8 | "funny-phone": [["What is funny about this image? Describe it panel by panel.", "./eval/eval_data/images/singles/1.jpg"]], 9 | "squirrel": [["Why would a person find this image funny?", "./eval/eval_data/images/singles/2.jpg"]], 10 | "art-painting": [["Tell me about this work of art.", "./eval/eval_data/images/singles/50.jpg"]] 11 | } 12 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg
-------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg -------------------------------------------------------------------------------- 
/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # DeepSpeed Team 5 | 6 | #EVAL_DATASET=eval_robustness eval_single eval_comprehensive (see the json files in the folder ./eval_data/*.json) 7 | MAIN_PATH=$1 8 | VISION_MODEL=/blob/transformers_cache/qwen-clip #openai/clip-vit-large-patch14 9 | LLM=/blob/transformers_cache/Llama-2-13b-hf #meta-llama/Llama-2-7b 10 | for EVAL_DATASET in eval_single eval_comprehensive eval_robustness 11 | do 12 | SAVE_PATH=eval/results/${EVAL_DATASET} 13 | mkdir -p ${SAVE_PATH} 14 | for CKPT_NAME in final best_eval 15 | do 16 | #NOTE: to run multi-GPU, you simply do "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;" 17 | export CUDA_VISIBLE_DEVICES=0; python eval/batch_generation.py --model_name dsvl --vis_proj baseline --max_seq_len 4096 \ 18 | --lm_model_name_or_path ${LLM} --vision_model_name_or_path ${VISION_MODEL} \ 19 | --checkpoint_path $MAIN_PATH --checkpoint_names $CKPT_NAME --eval_data ${EVAL_DATASET} \ 20 | --enable_mmca_attention --output_filename ${SAVE_PATH}/ours_${CKPT_NAME} &> ${SAVE_PATH}/ours_${CKPT_NAME}.log 21 | done 22 | done 23 | 24 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/README.md: -------------------------------------------------------------------------------- 1 | # QWen-VL's Vision Encoder 2 | The `extract_qwen_vl.py` script can be used to extract the vision encoder from QWen-VL. After extraction, you can find the other necessary files in the [folder](./qwen_clip). -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | import torch 3 | 4 | PATH = "Qwen/Qwen-VL-Chat" 5 | 6 | model = AutoModelForCausalLM.from_pretrained(PATH, device_map="cuda", trust_remote_code=True).eval() 7 | 8 | state_dict = model.state_dict() 9 | save_dict = {} 10 | for k,v in state_dict.items(): 11 | if 'visual' in k: 12 | if 'transformer.visual.proj' not in k: # we don't need the proj layer 13 | save_dict[k.replace('transformer.visual.', '')] = v 14 | torch.save(save_dict, './qwen_clip/pytorch_model.bin') -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 448, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 448 19 | } 20 | -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | open_clip_torch 6 | deepspeed>=0.10.3 7 | einops 8 | einops_exts 9 | transformers==4.33.3 10 | transformers_stream_generator 11 | termcolor -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/utils/data/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is adapted from
https://github.com/open-mmlab/Multimodal-GPT 2 | 3 | from .builder import build_dataset # noqa: F401 4 | from .vqa_dataset import VQADataset # noqa: F401 5 | from .utils import DataCollatorPadToMaxLen, split_dataset, shuffle_dataset # noqa: F401 6 | from .DST import add_special_token -------------------------------------------------------------------------------- /applications/DeepSpeed-VisualChat/utils/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_dsvl import create_dsvl_model_and_transforms -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | All benchmarks that use the DeepSpeed library are maintained in this folder. We welcome contributions in this space! 2 | -------------------------------------------------------------------------------- /benchmarks/communication/__init__.py: -------------------------------------------------------------------------------- 1 | '''Copyright The Microsoft DeepSpeed Team''' -------------------------------------------------------------------------------- /benchmarks/communication/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from deepspeed.accelerator import get_accelerator 7 | 8 | DEFAULT_WARMUPS = 5 9 | DEFAULT_TRIALS = 50 10 | DEFAULT_TYPE = 'float' 11 | DEFAULT_BACKEND = get_accelerator().communication_backend_name() 12 | DEFAULT_UNIT = 'Gbps' 13 | DEFAULT_DIST = 'deepspeed' 14 | DEFAULT_MAXSIZE = 24 15 | DEFAULT_DEVICE = 'cuda' 16 | TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 17 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.pyc 3 | *.png 4 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ddp_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | machine_rank: {{ machine_rank }} 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: {{ num_machines }} 8 | num_processes: {{ num_processes }} 9 | rdzv_backend: static 10 | same_network: true 11 | tpu_env: [] 12 | tpu_use_cluster: false 13 | tpu_use_sudo: false 14 | use_cpu: false 15 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ds_config.json.template: -------------------------------------------------------------------------------- 1 | { 2 | {% if fp16 %} 3 | "fp16": { 4 | "enabled": true, 5 | "initial_scale_power": 8 6 | }, 7 | {% else %} 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | {% endif %} 12 | "zero_optimization": { 13 | "stage": {{ zero_stage }}, 14 | "sub_group_size": 100000000 15 | }, 16 | "compile": { 17 | "deepcompile": {{ deepcompile }}, 18 | "offload_activation": false, 19 | "offload_opt_states": false, 20 | "double_buffer": true, 21 | "symmetric_memory": false, 22 | "free_activation": false, 23 | "debug_log": {{ debug_log }}, 24 | "sync_before_reduce": {{ sync_before_reduce }}, 25 | "sync_after_reduce": {{ sync_after_reduce }} 26 | }, 27 | 
"gradient_accumulation_steps": {{ gradient_accumulation_steps }}, 28 | "gradient_clipping": "auto", 29 | "steps_per_print": 2000, 30 | "train_batch_size": "auto", 31 | "train_micro_batch_size_per_gpu": "auto", 32 | "wall_clock_breakdown": false 33 | } -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/ds_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | {%- if zero_stage == 3 %} 6 | zero3_init_flag: true 7 | {%- endif %} 8 | deepspeed_config_file: configs/ds_config.json 9 | distributed_type: DEEPSPEED 10 | machine_rank: {{ machine_rank }} 11 | main_training_function: main 12 | num_machines: {{ num_machines }} 13 | num_processes: {{ num_processes }} 14 | rdzv_backend: static 15 | same_network: true 16 | tpu_env: [] 17 | tpu_use_cluster: false 18 | tpu_use_sudo: false 19 | use_cpu: false -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/fsdp_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | fsdp_config: 5 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 6 | fsdp_backward_prefetch: BACKWARD_PRE 7 | fsdp_cpu_ram_efficient_loading: true 8 | fsdp_forward_prefetch: false 9 | fsdp_offload_params: false 10 | {%- if zero_stage == 3 %} 11 | fsdp_sharding_strategy: FULL_SHARD 12 | {%- else %} 13 | fsdp_sharding_strategy: SHARD_GRAD_OP 14 | {%- endif %} 15 | fsdp_state_dict_type: SHARDED_STATE_DICT 16 | fsdp_sync_module_states: true 17 | fsdp_use_orig_params: true 18 | machine_rank: {{ machine_rank }} 19 | main_training_function: main 20 | mixed_precision: bf16 21 | num_machines: {{ num_machines }} 22 | num_processes: {{ num_processes }} 23 | rdzv_backend: static 24 | same_network: true 25 | tpu_env: [] 26 | tpu_use_cluster: false 27 | tpu_use_sudo: false 28 | use_cpu: false 29 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/configs/singlegpu_config.yaml.template: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: NO 4 | main_training_function: main 5 | mixed_precision: bf16 6 | use_cpu: false 7 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/hostfile_n4: -------------------------------------------------------------------------------- 1 | node-0 slots=8 2 | node-1 slots=8 3 | node-2 slots=8 4 | node-3 slots=8 5 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs2.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Llama-3-70B_np32_bs4.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs2.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1/throughput/chart_throughput_Mixtral-8x7B_np32_bs4.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Llama-3-70B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/deepcompile/results/acc_step_1_16/throughput/chart_throughput_Mixtral-8x7B_np32_bs1.png -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_bench_offload.sh: -------------------------------------------------------------------------------- 1 | PROFILE_DIR=${PROFILE_DIR:-"profile_offload"} 2 | mkdir -p ${PROFILE_DIR} 3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}" 4 | COMPILE_OPTS="--compile" 5 | DC_OPTS="--compile --deepcompile" 6 | ACC_OPTS="--gradient-accumulation-steps 1" 7 | AC_OPTS="--activation-checkpointing" 8 | 9 | mkdir -p logs 10 | 11 | export LOG_BASE="logs_offload" 12 | mkdir -p 
${LOG_BASE} 13 | 14 | MODEL="meta-llama/Meta-Llama-3-70B-Instruct" 15 | BATCH_SIZE_OPTS=(1) 16 | SEQ_LENGTH_OPTS=(1024) 17 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do 18 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do 19 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} ${ACC_OPTS} ${AC_OPTS} ${PROFILE_OPTS}" 20 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3 21 | bash ./run.sh --backend deepspeed ${ARGS} --zero-stage 3 --ds-offload 22 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states 23 | bash ./run.sh --backend deepspeed ${ARGS} ${DC_OPTS} --zero-stage 3 --eager --passes offload_adam_states_sync 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_bench_z1.sh: -------------------------------------------------------------------------------- 1 | PROFILE_DIR=${PROFILE_DIR:-profiles} 2 | mkdir -p ${PROFILE_DIR} 3 | PROFILE_OPTS="--profile --profile-dir ${PROFILE_DIR}" 4 | COMPILE_OPTS="--compile" 5 | DC_OPTS="--compile --deepcompile" 6 | ACC_OPTS="--gradient-accumulation-steps 1" 7 | AC_OPTS="--activation-checkpointing" 8 | 9 | export NUM_NODES=${NUM_NODES:-4} 10 | 11 | MODEL="meta-llama/Meta-Llama-3-8B-Instruct" 12 | BATCH_SIZE_OPTS=(1 2 4) 13 | SEQ_LENGTH_OPTS=(512 1024 2048) 14 | for BATCH_SIZE in ${BATCH_SIZE_OPTS[@]}; do 15 | for SEQ_LENGTH in ${SEQ_LENGTH_OPTS[@]}; do 16 | ARGS="--model ${MODEL} --batch-size ${BATCH_SIZE} --seq-length ${SEQ_LENGTH} --zero-stage 1 ${ACC_OPTS} ${AC_OPTS}" 17 | bash ./run_multinode.sh --backend deepspeed ${ARGS} 18 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${COMPILE_OPTS} 19 | bash ./run_multinode.sh --backend deepspeed ${ARGS} ${DC_OPTS} 20 | 21 | cp -r logs ${PROFILE_DIR}/ 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /benchmarks/deepcompile/run_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $* 4 | 5 | SCRIPT_DIR=$(dirname $(realpath $0)) 6 | HOST_IP=$(hostname -i) 7 | NUM_NODES=${NUM_NODES:-1} 8 | 9 | # verify that NUM_NODES is a positive integer 10 | if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then 11 | echo "Error: NUM_NODES must be a positive integer" 12 | exit 1 13 | fi 14 | 15 | # check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists 16 | if [ ! 
-f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then 17 | echo "Error: hostfile_n${NUM_NODES} does not exist" 18 | exit 1 19 | fi 20 | 21 | if [ "${NUM_NODES}" == "1" ]; then 22 | # avoid dependency on pdsh when possible 23 | cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $* 24 | else 25 | ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*" 26 | fi 27 | -------------------------------------------------------------------------------- /benchmarks/inference/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/128k-120.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 128000 2 | prompt_length_var: 0.1 3 | max_prompt_length: 131072 4 | max_new_tokens: 120 5 | max_new_tokens_var: 0.3 6 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/1300-120.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 1300 2 | prompt_length_var: 0.3 3 | max_new_tokens: 120 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/2600-60.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 2600 2 | prompt_length_var: 0.3 3 | max_new_tokens: 60 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/configs/500-500.yaml: -------------------------------------------------------------------------------- 1 | prompt_length: 500 2 | prompt_length_var: 0.3 3 | max_new_tokens: 500 4 | max_new_tokens_var: 0.3 5 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | [project] 5 | name = "deepspeedometer" 6 | version = "0.0.1" 7 | authors = [ 8 | { name="Ammar Ahmad Awan", email="ammar.awan@microsoft.com" }, 9 | { name="Arash Bakhtiari", email="abakhtiari@microsoft.com" }, 10 | { name="Connor Holmes"}, 11 | { name="Lev Kurilenko", email="lev.kurilenko@microsoft.com" }, 12 | { name="Heyang Qin", email="heyangqin@microsoft.com" }, 13 | { name="Masahiro Tanaka", email="mtanaka@microsoft.com" }, 14 | { name="Michael Wyatt", email="michaelwyatt@microsoft.com" }, 15 | ] 16 | description = "LLM benchmarking tool" 17 | readme = "README.md" 18 | requires-python = ">=3.8" 19 | classifiers = [ 20 | "Programming Language :: Python :: 3", 21 | ] 22 | dependencies = [ 23 | "loguru", 24 | "pydantic>=2.0.0", 25 | "torch", 26 | "tqdm", 27 | "transformers", 28 | ] 29 | 30 | [project.urls] 31 | Homepage = "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference/deepspeedometer" 32 | Issues = "https://github.com/deepspeedai/DeepSpeedExamples/issues" 33 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/run_example.sh: -------------------------------------------------------------------------------- 1 | python -m
src.deepspeedometer.benchmark_runner --model "facebook/opt-125m" --api dummy --config_file ./configs/1300-120.yaml 2 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py: -------------------------------------------------------------------------------- 1 | from .arg_parsing import parse_args_to_configs 2 | from .benchmark_runner import BenchmarkRunner 3 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseClient 2 | 3 | from .azure_ml_client import AzureMLClientConfig, AzureMLClient 4 | from .dummy_client import DummyClientConfig, DummyClient 5 | from .fastgen_client import FastGenClientConfig, FastGenClient 6 | from .vllm_client import vLLMClientConfig, vLLMClient 7 | from .openai_client import openaiClientConfig, openaiClient 8 | 9 | client_config_classes = { 10 | "dummy": DummyClientConfig, 11 | "azure_ml": AzureMLClientConfig, 12 | "fastgen": FastGenClientConfig, 13 | "vllm": vLLMClientConfig, 14 | "openai": openaiClientConfig 15 | } 16 | client_classes = { 17 | "dummy": DummyClient, 18 | "azure_ml": AzureMLClient, 19 | "fastgen": FastGenClient, 20 | "vllm": vLLMClient, 21 | "openai": openaiClient, 22 | } 23 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict 3 | 4 | from ..config import BaseConfigModel 5 | from ..prompt import Prompt 6 | 7 | 8 | class BaseClient(ABC): 9 | def __init__(self, config: BaseConfigModel) -> None: 10 | self.config = config 11 | 12 | @abstractmethod 13 | def start_service(self) -> None: 14 | pass 15 | 16 | @abstractmethod 17 | def stop_service(self) -> None: 18 | pass 19 | 20 | @abstractmethod 21 | def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: 22 | pass 23 | 24 | @abstractmethod 25 | def send_request(self, request_kwargs: Dict[str, Any]) -> Any: 26 | pass 27 | 28 | @abstractmethod 29 | def process_response(self, raw_response: Any) -> str: 30 | pass 31 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ConfigDict 2 | 3 | 4 | class BaseConfigModel(BaseModel): 5 | model_config = ConfigDict( 6 | validate_default=True, 7 | validate_assignment=False, 8 | use_enum_values=True, 9 | populate_by_name=True, 10 | extra="forbid", 11 | arbitrary_types_allowed=True, 12 | protected_namespaces=(), 13 | ) 14 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | from typing import Any 3 | 4 | 5 | @dataclass 6 | class Response: 7 | prompt_text: str = "" 8 | prompt_tokens: int = 0 9 | generated_output: str = "" 10 | generated_tokens: int = 0 11 | request_time: float = 0 12 | raw_response: Any = None 13 | client_id: int = 0 14 | 15 | def to_dict(self) -> dict: 
16 | return asdict(self) 17 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/README.md: -------------------------------------------------------------------------------- 1 | To run the unit tests: 2 | 3 | `python3 -m pytest .` -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/deepspeedometer/tests/__init__.py -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner 4 | 5 | 6 | def test_benchmark_runner(benchmark_args, num_clients): 7 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 8 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 9 | benchmark_runner.run() 10 | 11 | expected_results = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( 12 | num_clients 13 | ) 14 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) 15 | assert ( 16 | expected_results == actual_results 17 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." 18 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import yaml 4 | 5 | import pydantic 6 | 7 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs 8 | 9 | 10 | def test_config(benchmark_args): 11 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 12 | 13 | 14 | @pytest.mark.parametrize("model", [""]) 15 | def test_config_required_fail(benchmark_args): 16 | with pytest.raises(pydantic.ValidationError): 17 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 18 | 19 | 20 | @pytest.mark.parametrize("num_config_files", [1]) 21 | def test_config_file(benchmark_args, config_files, num_clients): 22 | # Create a config that would generate 6 benchmark settings 23 | config = {"max_prompt_length": [500, 1300, 2600], "num_clients": [1, 2]} 24 | with open(config_files[0], "w") as f: 25 | yaml.dump(config, f) 26 | 27 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 28 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 29 | benchmark_settings = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( 30 | num_clients 31 | ) 32 | assert benchmark_settings == 6 33 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_early_stop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import parse_args_to_configs, BenchmarkRunner 4 | 5 | 6 | @pytest.mark.parametrize("num_clients", [(1, 2, 4)], indirect=True) 7 | def test_early_stop(benchmark_args): 8 | benchmark_args += [ 9 | "--early_stop_latency", 10 | "1", 11 | "--dummy_client_latency_time", 12 | "2.0", 13 | ] 14 | 
print(benchmark_args) 15 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 16 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 17 | benchmark_runner.run() 18 | 19 | expected_results = 1 20 | actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) 21 | assert ( 22 | expected_results == actual_results 23 | ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." 24 | -------------------------------------------------------------------------------- /benchmarks/inference/deepspeedometer/tests/test_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepspeedometer import BenchmarkRunner, parse_args_to_configs 4 | 5 | 6 | @pytest.mark.parametrize("prompt_length_var, max_new_tokens_var", [(0, 0)]) 7 | def test_prompt_length(benchmark_args): 8 | benchmark_config, client_config = parse_args_to_configs(benchmark_args) 9 | benchmark_runner = BenchmarkRunner(benchmark_config, client_config) 10 | num_clients, prompt_config = next(benchmark_runner._benchmark_settings()) 11 | 12 | for prompt in benchmark_runner.prompt_generator(prompt_config, num_prompts=10): 13 | prompt_length = benchmark_runner.prompt_generator.count_tokens(prompt.text) 14 | # Using pytest.approx here because often we will have off-by-one errors due to tokenization special tokens 15 | assert prompt_length == pytest.approx(benchmark_runner.config.prompt_length, 1) 16 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/A6000_benchmarks_example.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/benchmarks/inference/mii/A6000_benchmarks_example.PNG -------------------------------------------------------------------------------- /benchmarks/inference/mii/plot_config.yaml: -------------------------------------------------------------------------------- 1 | label: "vLLM" 2 | color: "purple" 3 | marker: "o" 4 | linestyle: "--" 5 | polyfit_degree: 0 6 | x_max : 30 7 | y_max : 10 8 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | matplotlib 3 | deepspeed-mii>=0.2.0 4 | vllm>=0.2.7 5 | numpy 6 | tabulate 7 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_all.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) 7 | 8 | for MODEL in ${MODELS[@]}; do 9 | python ./run_benchmark.py --model ${MODEL} --stream --backend fastgen 10 | python ./run_benchmark.py --model ${MODEL} --stream --backend vllm 11 | done 12 | 13 | # Extra runs for Mixtral with non-default settings 14 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend fastgen 15 | python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend vllm -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_aml.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Run benchmark against AML endpoint 7 | python ./run_benchmark.py \ 8 | --model <model name> \ 9 | --deployment_name <aml deployment name> \ 10 | --aml_api_url <aml endpoint url> \ 11 | --aml_api_key <aml api key> \ 12 | --mean_prompt_length 2600 \ 13 | --mean_max_new_tokens 60 \ 14 | --num_requests 256 \ 15 | --backend aml 16 | 17 | ### Generate the plots 18 | python ./src/plot_th_lat.py 19 | 20 | echo "Find figures in ./plots/ and log outputs in ./results/" -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_example.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Run benchmark 7 | python ./run_benchmark.py \ 8 | --model meta-llama/Llama-2-7b-hf \ 9 | --tp_size 1 \ 10 | --num_replicas 1 \ 11 | --max_ragged_batch_size 768 \ 12 | --mean_prompt_length 2600 \ 13 | --mean_max_new_tokens 60 \ 14 | --stream \ 15 | --backend fastgen \ 16 | 17 | ### Generate the plots 18 | python ./src/plot_th_lat.py 19 | 20 | echo "Find figures in ./plots/ and log outputs in ./results/" -------------------------------------------------------------------------------- /benchmarks/inference/mii/run_fp6.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | MODELS=(NousResearch/Llama-2-70b-hf) 7 | 8 | for MODEL in ${MODELS[@]}; do 9 | python ./run_benchmark.py --model ${MODEL} --num_requests 128 --stream --backend fastgen --fp6 --tp_size 1 10 | done -------------------------------------------------------------------------------- /benchmarks/inference/mii/src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /benchmarks/inference/mii/src/random_query_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import numpy as np 7 | import torch 8 | import random 9 | 10 | 11 | class RandomQueryGenerator: 12 | def __init__(self, input_text, tokenizer, seed): 13 | self.input_text = input_text 14 | self.tokenizer = tokenizer 15 | 16 | torch.manual_seed(seed) 17 | random.seed(seed) 18 | np.random.seed(seed) 19 | 20 | def get_random_request_text(self, length, variance, max_length, batch): 21 | request_text = [] 22 | tokenized_input = self.tokenizer.batch_encode_plus( 23 | [self.input_text], return_tensors="pt", padding=False 24 | ) 25 | offset = list(range(512)) 26 | random.shuffle(offset) 27 | 28 | text_ids = tokenized_input["input_ids"][0] 29 | for i in range(batch): 30 | # Sample the request prompt length from a normal distribution with mean=length and std=variance, capped at max_length 31 | req_prompt_length = min(int(np.random.normal(length, variance)), max_length) 32 | 33 | text = self.tokenizer.decode(text_ids[i : req_prompt_length + i]) 34 | request_text.append(text) 35 | return request_text 36 | -------------------------------------------------------------------------------- /benchmarks/inference/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.21.3 2 | -------------------------------------------------------------------------------- /benchmarks/inference/run_model.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | model=$1 4 | dtype=$2 5 | graphs=$3 6 | kernel=$4 7 | gpus=$5 8 | 9 | version=0 10 | log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version} 11 | mkdir -p ${log_path} 12 | 13 | params="--dtype $dtype " 14 | if [[ "$graphs" == "true" ]]; then 15 | params+="--graphs " 16 | fi 17 | if [[ "$kernel" == "true" ]]; then 18 | params+="--kernel " 19 | fi 20 | 21 | echo "baseline $log_path" 22 | deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log 23 | 24 | echo "deepspeed $log_path" 25 | deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/deepspeed.log -------------------------------------------------------------------------------- /compression/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Model Compression examples 2 | 3 | The examples in this folder demonstrate features and models that take advantage of the DeepSpeed compression library. 4 | 5 | A detailed tutorial for understanding and using the DeepSpeed model compression features is available here: https://www.deepspeed.ai/tutorials/model-compression/ 6 | -------------------------------------------------------------------------------- /compression/bert/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install -r requirements.txt`` 4 | 5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 6 | 7 | #### Key File: run_glue_no_trainer.py 8 | 9 | The Python code is adapted from [HuggingFace's PyTorch text_classification](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification). The key added feature is the implementation of knowledge distillation (KD), enabled with `--distill_method one_stage`. To train without KD, use `--distill_method zero_stage`. 
10 | 11 | #### Folders (config, huggingface_transformer, bash_script) 12 | 13 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 14 | * **huggingface_transformer:** This folder contains the implementation of knowledge distillation. It is based on [HuggingFace's transformer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py). 15 | The change is at line 383, where we output `attention_scores` instead of `attention_probs`. 16 | * **bash_script:** This folder contains many bash scripts for various kinds of compression. See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 17 | 18 | -------------------------------------------------------------------------------- /compression/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | transformers == 4.15.0 3 | datasets >= 1.8.0 4 | sentencepiece != 0.1.92 5 | scipy 6 | scikit-learn 7 | protobuf 8 | gpustat 9 | torch >= 1.3 10 | -------------------------------------------------------------------------------- /compression/cifar/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install torch torchvision`` 4 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 5 | 6 | #### Key File: train.py 7 | 8 | The Python code is adapted from https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar. The key added feature is the compression pipeline. 9 | 10 | #### Folders (config) 11 | 12 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 13 | 14 | #### bash script 15 | * **run_compress.sh:** This bash script contains jobs for training a checkpoint and then compressing this checkpoint. See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 16 | 17 | -------------------------------------------------------------------------------- /compression/cifar/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | 23 | "fp16":{ 24 | "enabled": true 25 | }, 26 | 27 | "gradient_clipping": 1.0, 28 | "prescale_gradients": true, 29 | 30 | "wall_clock_breakdown" : false 31 | } 32 | 33 | -------------------------------------------------------------------------------- /compression/gpt2/README.md: -------------------------------------------------------------------------------- 1 | #### Install 2 | 3 | ``pip install -r requirements.txt`` 4 | 5 | You will also need to install an updated DeepSpeed version (>0.7.0), which contains the compression library. 6 | 7 | 8 | #### Key File: run_clm_no_trainer.py 9 | 10 | The Python code is adapted from HuggingFace's run_clm_no_trainer.py (https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py). The key added feature is the compression pipeline. 11 | 12 | #### Folders (config) 13 | 14 | * **config:** This folder provides DeepSpeed configuration, including quantization, pruning and layer reduction. 
15 | 16 | #### bash script 17 | * **run_zero_quant.sh:** This bash script contains jobs for training a checkpoint and then compressing this checkpoint. Run the job under the gpt2 directory: 18 | 19 | ```DeepSpeedExamples/compression/gpt2$ . ./bash_script/run_zero_quant.sh``` 20 | See more descriptions and results in our [tutorial page](https://www.deepspeed.ai/). 21 | 22 | -------------------------------------------------------------------------------- /compression/gpt2/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 8, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | 23 | "fp16":{ 24 | "enabled": true 25 | }, 26 | 27 | "gradient_clipping": 1.0, 28 | "prescale_gradients": true, 29 | 30 | "wall_clock_breakdown" : false 31 | } 32 | 33 | -------------------------------------------------------------------------------- /compression/gpt2/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.8.0 2 | sentencepiece != 0.1.92 3 | protobuf 4 | transformers == 4.15.0 5 | accelerate -------------------------------------------------------------------------------- /deepnvme/file_access/aio_load_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from deepspeed.ops.op_builder import AsyncIOBuilder 4 | from utils import parse_read_arguments, GIGA_UNIT 5 | 6 | def file_read(inp_f, handle, bounce_buffer): 7 | handle.sync_pread(bounce_buffer, inp_f) 8 | return bounce_buffer.cuda() 9 | 10 | 11 | def main(): 12 | args = parse_read_arguments() 13 | input_file = args.input_file 14 | file_sz = os.path.getsize(input_file) 15 | cnt = args.loop 16 | 17 | aio_handle = AsyncIOBuilder().load().aio_handle() 18 | bounce_buffer = torch.empty(file_sz, dtype=torch.uint8).pin_memory()  # pinned CPU bounce buffer for the file -> CPU -> GPU path 19 | 20 | t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) 21 | aio_t = t.timeit(cnt) 22 | aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t 23 | print(f'aio load_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') 24 | 25 | if args.validate: 26 | from py_load_cpu_tensor import file_read as py_file_read 27 | aio_tensor = file_read(input_file, aio_handle, bounce_buffer).cpu() 28 | py_tensor = py_file_read(input_file) 29 | print(f'Validation success = {aio_tensor.equal(py_tensor)}') 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /deepnvme/file_access/media/deepnvme_ops_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/file_access/media/deepnvme_ops_report.png -------------------------------------------------------------------------------- /deepnvme/file_access/py_load_cpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from utils import parse_read_arguments, GIGA_UNIT 4 | 5 | def file_read(inp_f): 6 | with open(inp_f, 'rb') as f: 7 | tensor = 
torch.frombuffer(f.read(), dtype=torch.uint8) 8 | return tensor 9 | 10 | def main(): 11 | args = parse_read_arguments() 12 | input_file = args.input_file 13 | file_sz = os.path.getsize(input_file) 14 | cnt = args.loop 15 | 16 | t = timeit.Timer(functools.partial(file_read, input_file)) 17 | py_t = t.timeit(cnt) 18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 19 | print(f'py load_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_load_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | from utils import parse_read_arguments, GIGA_UNIT 4 | 5 | def file_read(inp_f): 6 | with open(inp_f, 'rb') as f: 7 | tensor = torch.frombuffer(f.read(), dtype=torch.uint8) 8 | return tensor.cuda() 9 | 10 | def main(): 11 | args = parse_read_arguments() 12 | input_file = args.input_file 13 | file_sz = os.path.getsize(input_file) 14 | cnt = args.loop 15 | 16 | t = timeit.Timer(functools.partial(file_read, input_file)) 17 | py_t = t.timeit(cnt) 18 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 19 | print(f'py load_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_store_cpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | import pathlib 4 | from utils import parse_write_arguments, GIGA_UNIT 5 | 6 | def file_write(out_f, tensor): 7 | with open(out_f, 'wb') as f: 8 | f.write(tensor.numpy(force=True)) 9 | 10 | def main(): 11 | args = parse_write_arguments() 12 | cnt = args.loop 13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt') 14 | pathlib.Path(output_file).unlink(missing_ok=True) 15 | file_sz = args.mb_size*(1024**2) 16 | cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) 17 | 18 | t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor)) 19 | 20 | py_t = t.timeit(cnt) 21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 22 | print(f'py store_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') 23 | pathlib.Path(output_file).unlink(missing_ok=True) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /deepnvme/file_access/py_store_gpu_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os, timeit, functools 3 | import pathlib 4 | from utils import parse_write_arguments, GIGA_UNIT 5 | 6 | def file_write(out_f, tensor): 7 | with open(out_f, 'wb') as f: 8 | f.write(tensor.numpy(force=True)) 9 | 10 | def main(): 11 | args = parse_write_arguments() 12 | cnt = args.loop 13 | output_file = os.path.join(args.nvme_folder, f'test_output_{args.mb_size}MB.pt') 14 | pathlib.Path(output_file).unlink(missing_ok=True) 15 | file_sz = args.mb_size*(1024**2) 16 | gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) 17 | 18 | t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor)) 19 | 20 | py_t = t.timeit(cnt) 21 | py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t 22 | print(f'py store_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} 
secs, {py_gbs:5.2f} GB/sec') 23 | pathlib.Path(output_file).unlink(missing_ok=True) 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /deepnvme/file_access/run_load_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -ne 1 ]]; then 4 | echo "Usage: $0 <input_file>" 5 | exit 1 6 | fi 7 | 8 | input_file=$1 9 | if ! [[ -f "$input_file" ]]; then 10 | echo "Error: $input_file does not exist" 11 | exit 1 12 | fi 13 | 14 | 15 | echo "Running load tensor examples using $input_file" 16 | for f in aio_load_cpu_tensor.py aio_load_gpu_tensor.py \ 17 | gds_load_gpu_tensor.py \ 18 | py_load_cpu_tensor.py py_load_gpu_tensor.py; do 19 | cmd="python $f --input_file $input_file" 20 | sync 21 | echo $cmd 22 | eval $cmd 23 | sleep 2 24 | done 25 | 26 | 27 | -------------------------------------------------------------------------------- /deepnvme/file_access/run_store_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -ne 1 ]]; then 4 | echo "Usage: $0 <output_folder>" 5 | exit 1 6 | fi 7 | 8 | output_folder=$1 9 | if ! [[ -d "$output_folder" ]]; then 10 | echo "Error: $output_folder does not exist" 11 | exit 1 12 | fi 13 | 14 | 15 | echo "Running store tensor examples using $output_folder" 16 | for f in aio_store_cpu_tensor.py aio_store_gpu_tensor.py \ 17 | gds_store_gpu_tensor.py \ 18 | py_store_cpu_tensor.py py_store_gpu_tensor.py; do 19 | cmd="python $f --nvme_folder $output_folder" 20 | sync 21 | echo $cmd 22 | eval $cmd 23 | sleep 2 24 | done 25 | 26 | 27 | -------------------------------------------------------------------------------- /deepnvme/model_checkpoint/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/nvme_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/nvme_config.png -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png -------------------------------------------------------------------------------- /deepnvme/zero_inference/media/zero_inf_mem_use_gds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png -------------------------------------------------------------------------------- /inference/huggingface/automatic-speech-recognition/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Automatic Speech Recognition Examples 3 | 4 | # Setup 5 | Python dependencies: 6 | <pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | For the `test-wav2vec.py` speech model example, you may also need to install the `libsndfile1-dev` generic library: 11 |
<pre>
12 | sudo apt-get install libsndfile1-dev
13 | </pre>
14 | 15 | # Usage 16 | Examples can be run as follows: 17 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
18 | 19 | # Example Output 20 | Command: 21 |
<pre>
22 | deepspeed --num_gpus 1 test-wav2vec2.py
23 | </pre>
24 | 25 | Output: 26 |
<pre>
27 | WER: 0.03383673158855752
28 | </pre>
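
For orientation, these ASR scripts follow the same pattern as the other Hugging Face inference examples in this repo: build a `transformers` pipeline, then wrap its model with `deepspeed.init_inference`. The sketch below is illustrative only, not the repo's `test-wav2vec2.py` (which also evaluates WER over a dataset); the model name, the input file, and the kernel-injection flag are assumptions.

```python
# Minimal sketch of a DeepSpeed-accelerated ASR pipeline (illustrative;
# not the repo's test-wav2vec2.py). Model name, input file, and
# init_inference flags below are assumptions.
import os
import torch
import deepspeed
from transformers import pipeline

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

asr = pipeline('automatic-speech-recognition',
               model='facebook/wav2vec2-base-960h',
               device=local_rank)

# Wrap the underlying model with DeepSpeed's optimized inference engine.
asr.model = deepspeed.init_inference(asr.model,
                                     mp_size=world_size,
                                     dtype=torch.float,
                                     replace_with_kernel_inject=True)

print(asr('sample.wav'))  # 'sample.wav' is a hypothetical input file
```
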
29 | -------------------------------------------------------------------------------- /inference/huggingface/automatic-speech-recognition/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | soundfile 5 | jiwer 6 | datasets 7 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Fill Mask Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-roberta.py
18 | </pre>
19 | 20 | Output: 21 | <pre>
22 | [{'score': 0.40290409326553345, 'token': 3742, 'token_str': ' Internet', 'sequence': 'The invention of the Internet revolutionized the way we communicate with each other.'}, {'score': 0.20314466953277588, 'token': 7377, 'token_str': ' telephone', 'sequence': 'The invention of the telephone revolutionized the way we communicate with each other.'}, {'score': 0.17653286457061768, 'token': 2888, 'token_str': ' internet', 'sequence': 'The invention of the internet revolutionized the way we communicate with each other.'}, {'score': 0.06900821626186371, 'token': 4368, 'token_str': ' smartphone', 'sequence': 'The invention of the smartphone revolutionized the way we communicate with each other.'}, {'score': 0.03270129859447479, 'token': 3034, 'token_str': ' computer', 'sequence': 'The invention of the computer revolutionized the way we communicate with each other.'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/test-electra.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.electra.modeling_electra import ElectraLayer 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 10 | world_size = int(os.getenv('WORLD_SIZE', '4')) 11 | 12 | pipe = pipeline('fill-mask', model="google/electra-base-generator", 13 | tokenizer="google/electra-base-generator") 14 | 15 | # The injection_policy shows two things: 16 | # 1. which layer module we need to add Tensor-Parallelism 17 | # 2. the name of one or several linear layers: a) attention_output (both encoder and decoder), 18 | # and b) transformer output 19 | pipe.model = deepspeed.init_inference( 20 | pipe.model, 21 | mp_size=world_size, 22 | dtype=torch.float, 23 | injection_policy={ElectraLayer: ('output.dense')} 24 | ) 25 | pipe.device = torch.device(get_accelerator().device_name(local_rank)) 26 | output = pipe(f"HuggingFace is creating a {pipe.tokenizer.mask_token} that the community uses to solve NLP tasks.") 27 | 28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 29 | print(output) 30 | -------------------------------------------------------------------------------- /inference/huggingface/fill-mask/test-roberta.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.roberta.modeling_roberta import RobertaLayer 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 10 | world_size = int(os.getenv('WORLD_SIZE', '4')) 11 | 12 | pipe = pipeline('fill-mask', model="roberta-large", device=local_rank) 13 | 14 | # The injection_policy shows two things: 15 | # 1. which layer module we need to add Tensor-Parallelism 16 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder), 17 | # and b) transformer output 18 | 19 | pipe.model = deepspeed.init_inference( 20 | pipe.model, 21 | mp_size=world_size, 22 | dtype=torch.float, 23 | injection_policy={RobertaLayer: ('output.dense')} 24 | ) 25 | 26 | pipe.device = torch.device(get_accelerator().device_name(local_rank)) 27 | output = pipe("The invention of the revolutionized the way we communicate with each other.") 28 | 29 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 30 | print(output) 31 | -------------------------------------------------------------------------------- /inference/huggingface/stable-diffusion/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Stable Diffusion Example 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | NOTE: Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`. 15 | 16 | # Example Output 17 | Command: 18 |
<pre>
19 | deepspeed --num_gpus 1 test-stable-diffusion.py
20 | </pre>
21 | 22 | Output: 23 |
<pre>
24 | ./baseline.png
25 | ./deepspeed.png
26 | </pre>
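
`test-stable-diffusion.py` itself is not reproduced in this listing, so the following is a minimal sketch of the pattern such a script follows: generate a baseline image, wrap the pipeline with `deepspeed.init_inference`, and generate again. The model id, prompt, and `init_inference` flags are assumptions, not the actual file contents.

```python
# Illustrative sketch only -- not the repo's test-stable-diffusion.py.
# Model id, prompt, and init_inference flags are assumptions.
import torch
import deepspeed
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
                                         torch_dtype=torch.half).to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"

# Baseline image with stock diffusers modules.
pipe(prompt).images[0].save("baseline.png")

# Ask DeepSpeed to swap in optimized submodules for the replaced SD parts.
# Per the NOTE above, local CUDA graphs apply only when mp_size == 1.
pipe = deepspeed.init_inference(pipe,
                                mp_size=1,
                                dtype=torch.half,
                                replace_with_kernel_inject=True)
pipe(prompt).images[0].save("deepspeed.png")
```
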
27 | -------------------------------------------------------------------------------- /inference/huggingface/stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | diffusers>=0.22.3 4 | triton==2.0.0.dev20221202 5 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.28.1 4 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Text Generation Script 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | The [`test-run-generation.py`](./test-run-generation.py) example can be run using [test-gpt.sh](./test-gpt.sh), which serves as an example of how to run the script. 12 |
<pre>
13 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
14 |     --model_type=gpt2 \
15 |     --model_name_or_path=gpt2-xl \
16 |     --sample_input single_query.txt \
17 |     --fp16 \
18 |     --ds-inference
19 | </pre>
20 | 21 | # Example Output 22 | Command: 23 |
<pre>
24 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \
25 |     --model_type=gpt2 \
26 |     --model_name_or_path=gpt2-xl \
27 |     --sample_input single_query.txt \
28 |     --fp16 \
29 |     --ds-inference
30 | </pre>
31 | 32 | Output: 33 |
<pre>
34 | === GENERATED SEQUENCE 1 ===
35 | What is DeepSpeed?
36 | 
37 | DeepSpeed is a multi-dimensional data compression framework designed to achieve high compression ratio on human readable
38 | </pre>
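
`test-run-generation.py` is likewise not reproduced in this listing. With `--ds-inference`, its core step is wrapping the Hugging Face generation model in DeepSpeed's inference engine, roughly as sketched below; the model name is taken from the command above, while the remaining flags and the prompt handling are assumptions.

```python
# Sketch of the --ds-inference path (illustrative only; the actual
# test-run-generation.py supports many more model types and options).
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

# --fp16 plus --ds-inference: half precision with fused inference kernels.
model = deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.half,
                                 replace_with_kernel_inject=True)

inputs = tokenizer("What is DeepSpeed?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
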
39 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | numpy 5 | sentencepiece 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/single_query.txt: -------------------------------------------------------------------------------- 1 | What is DeepSpeed? 2 | -------------------------------------------------------------------------------- /inference/huggingface/text-generation/run-generation-script/test-gpt.sh: -------------------------------------------------------------------------------- 1 | deepspeed --num_nodes 1 --num_gpus 1 test-run-generation.py \ 2 | --model_type=gpt2 \ 3 | --model_name_or_path=gpt2-xl \ 4 | --sample_input single_query.txt \ 5 | --fp16 \ 6 | --ds-inference 7 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Text2Text Generation Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-t5.py
18 | </pre>
19 | 20 | Output: 21 |
<pre>
22 | [{'generated_text': 'd review: this is the best cast iron skillet. Great review! Great review! Great'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | sentencepiece 5 | google 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/text2text-generation/test-t5.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import transformers 3 | import deepspeed 4 | import torch 5 | import os 6 | from transformers.models.t5.modeling_t5 import T5Block 7 | 8 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 9 | world_size = int(os.getenv('WORLD_SIZE', '4')) 10 | 11 | pipe = pipeline("text2text-generation", model="google/t5-v1_1-small", device=local_rank) 12 | 13 | # The injection_policy shows two things: 14 | # 1. which layer module we need to add Tensor-Parallelism 15 | # 2. the name of several linear layers: a) attention_output (both encoder and decoder), 16 | # and b) transformer output 17 | 18 | pipe.model = deepspeed.init_inference( 19 | pipe.model, 20 | mp_size=world_size, 21 | dtype=torch.float, 22 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} 23 | ) 24 | 25 | pipe.device = torch.device(f'cuda:{local_rank}') 26 | output = pipe("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy") 27 | 28 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 29 | print(output) 30 | -------------------------------------------------------------------------------- /inference/huggingface/translation/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DeepSpeed Huggingface Translation Examples 3 | 4 | # Setup 5 | Python dependencies: 6 |
<pre>
 7 | pip install -r requirements.txt
 8 | </pre>
9 | 10 | # Usage 11 | Examples can be run as follows: 12 |
<pre>deepspeed --num_gpus [number of GPUs] test-[model].py</pre>
13 | 14 | # Example Output 15 | Command: 16 |
<pre>
17 | deepspeed --num_gpus 1 test-t5-base.py
18 | </pre>
19 | 20 | Output: 21 |
<pre>
22 | [{'translation_text': 'Le renard brun rapide saute au-dessus du chien lazy.'}]
23 | </pre>
24 | -------------------------------------------------------------------------------- /inference/huggingface/translation/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | torch 3 | transformers==4.21.2 4 | sentencepiece 5 | google 6 | protobuf 7 | -------------------------------------------------------------------------------- /inference/huggingface/translation/test-t5-base.py: -------------------------------------------------------------------------------- 1 | import deepspeed 2 | import torch 3 | import os 4 | from transformers import pipeline 5 | from transformers.models.t5.modeling_t5 import T5Block 6 | 7 | local_rank = int(os.getenv('LOCAL_RANK', '0')) 8 | world_size = int(os.getenv('WORLD_SIZE', '4')) 9 | 10 | # Init translator 11 | translator = pipeline("translation_en_to_fr", model="t5-base", tokenizer="t5-base", device=local_rank) 12 | 13 | # DeepSpeed init_inference API 14 | translator.model = deepspeed.init_inference(translator.model, 15 | mp_size=world_size, 16 | dtype=torch.float, 17 | injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} 18 | ) 19 | 20 | # Translate text 21 | text = "The quick brown fox jumps over the lazy dog." 22 | translation = translator(text) 23 | 24 | # Print translation 25 | print(translation) 26 | -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/images/over_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/inference/huggingface/zero_inference/images/over_v1.png -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed>=0.10.1 2 | torch 3 | transformers @ git+https://github.com/tjruwase/transformers@kvcache-offload-cpu 4 | packaging 5 | accelerate 6 | -------------------------------------------------------------------------------- /inference/huggingface/zero_inference/run_llama2_70b_a6000.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export USE_TF=0 3 | BASE_LOG_DIR=~/experiments/zero_inference/ 4 | MODEL_NAME="Llama-2-70b-hf" 5 | FULL_MODEL_NAME="meta-llama/${MODEL_NAME}" 6 | QB=4 7 | 8 | BSZ=64 9 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 10 | mkdir -p $LOG_DIR 11 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt 12 | 13 | BSZ=96 14 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 15 | mkdir -p $LOG_DIR 16 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt 17 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt 18 | 19 | 20 | BSZ=200 21 | LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} 22 | mkdir -p $LOG_DIR 23 | deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload --quant_bit ${QB} &> 
$LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv_q${QB}.txt 24 | -------------------------------------------------------------------------------- /inference/mii/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed MII Examples 2 | 3 | Install the requirements by running `pip install -r requirements.txt`. 4 | 5 | Once [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. See the scripts in [non-persistent](./non-persistent/) and [persistent](./persistent/) for examples. Details on the code implemented in these scripts can be found on our [Getting Started guide for MII](https://github.com/deepspeedai/DeepSpeed-mii#getting-started-with-mii). 6 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/README.md: -------------------------------------------------------------------------------- 1 | # Non-Persistent Pipeline Examples 2 | 3 | The `pipeline.py` script can be used to run any of the [supported 4 | models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models). Provide 5 | the HuggingFace model name, maximum generated tokens, and prompt(s). The 6 | generated responses will be printed in the terminal: 7 | 8 | ```shell 9 | $ python pipeline.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is" 10 | ``` 11 | 12 | Tensor-parallelism can be controlled using the `deepspeed` launcher and setting 13 | `--num_gpus`: 14 | 15 | ```shell 16 | $ deepspeed --num_gpus 2 pipeline.py 17 | ``` 18 | 19 | ## Model-Specific Examples 20 | 21 | For convenience, we also provide a set of scripts to quickly test the MII 22 | Pipeline with some popular text-generation models: 23 | 24 | | Model | Launch command | 25 | |-------|----------------| 26 | | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b) | `$ python llama2.py` | 27 | | [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) | `$ python falcon.py` | 28 | | [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | `$ deepspeed --num_gpus 2 mixtral.py` | -------------------------------------------------------------------------------- /inference/mii/non-persistent/falcon.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("tiiuae/falcon-7b") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/llama2.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("meta-llama/Llama-2-7b-hf") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | -------------------------------------------------------------------------------- /inference/mii/non-persistent/mixtral.py: -------------------------------------------------------------------------------- 1 | import mii 2 | 3 | pipe = mii.pipeline("mistralai/Mixtral-8x7B-v0.1") 4 | responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) 5 | if pipe.is_rank_0: 6 | print(responses[0]) 7 | 
-------------------------------------------------------------------------------- /inference/mii/non-persistent/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument( 7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"] 8 | ) 9 | parser.add_argument("--max-new-tokens", type=int, default=128) 10 | args = parser.parse_args() 11 | 12 | pipe = mii.pipeline(args.model) 13 | responses = pipe( 14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True 15 | ) 16 | 17 | if pipe.is_rank_0: 18 | for r in responses: 19 | print(r, "\n", "-" * 80, "\n") 20 | -------------------------------------------------------------------------------- /inference/mii/persistent/README.md: -------------------------------------------------------------------------------- 1 | # Persistent Deployment Examples 2 | 3 | The `serve.py` script can be used to create an inference server for any of the 4 | [supported models](https://github.com/deepspeedai/DeepSpeed-mii#supported-models). 5 | Provide the HuggingFace model name and tensor-parallelism (use the default 6 | values and run `$ python serve.py` for a single-GPU 7 | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) 8 | deployment): 9 | 10 | ```shell 11 | $ python serve.py --model "mistralai/Mistral-7B-v0.1" --tensor-parallel 1 12 | ``` 13 | 14 | Connect to the persistent deployment and generate text with `client.py`. Provide 15 | the HuggingFace model name, maximum generated tokens, and prompt(s) (or if you 16 | are using the default values, run `$ python client.py`): 17 | 18 | ```shell 19 | $ python client.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is" 20 | ``` 21 | 22 | Shut down the persistent deployment with `terminate.py`. 
Provide the HuggingFace 23 | model name (or if you are using the default values, run `$ python 24 | terminate.py`): 25 | 26 | ```shell 27 | $ python terminate.py --model "mistralai/Mistral-7B-v0.1" 28 | ``` -------------------------------------------------------------------------------- /inference/mii/persistent/client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument( 7 | "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"] 8 | ) 9 | parser.add_argument("--max-new-tokens", type=int, default=128) 10 | args = parser.parse_args() 11 | 12 | client = mii.client(args.model) 13 | responses = client( 14 | args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True 15 | ) 16 | 17 | for r in responses: 18 | print(r, "\n", "-" * 80, "\n") 19 | -------------------------------------------------------------------------------- /inference/mii/persistent/serve.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | parser.add_argument("--tensor-parallel", type=int, default=1) 7 | args = parser.parse_args() 8 | 9 | mii.serve(args.model, tensor_parallel=args.tensor_parallel) 10 | 11 | print(f"Serving model {args.model} on {args.tensor_parallel} GPU(s).") 12 | print(f"Run `python client.py --model {args.model}` to connect.") 13 | print(f"Run `python terminate.py --model {args.model}` to terminate.") 14 | -------------------------------------------------------------------------------- /inference/mii/persistent/terminate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mii 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") 6 | args = parser.parse_args() 7 | 8 | client = mii.client(args.model) 9 | client.terminate_server() 10 | 11 | print(f"Terminated server for model {args.model}.") 12 | -------------------------------------------------------------------------------- /inference/mii/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed-mii>=0.1.3 2 | -------------------------------------------------------------------------------- /inference/sglang/README.md: -------------------------------------------------------------------------------- 1 | # SGLang + ZeRO-Inference Examples 2 | This folder contains examples of [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) integration into the [SGLang](https://github.com/sgl-project/sglang) framework. This integration enables SGLang to run inference on massive models (e.g., with hundreds of billions of parameters) on a single GPU through the NVMe/CPU offloading optimizations of ZeRO-Inference. 3 | 4 | ## Prerequisites 5 | 1. DeepSpeed version >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6) 6 | 2. SGLang: These examples require our SGLang [fork](https://github.com/tjruwase/sglang/tree/zero-inference). We plan to upstream the SGLang changes to the main branch. 7 | 8 | 9 | ## Examples 10 | The examples comprise the following: 11 | 1. 
bash scripts that benchmark SGLang throughput in [offline mode](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) with different ZeRO-Inference offloading options. Each script runs inference on a different model with a prompt of 512 tokens, output of 32 tokens, and batch size of 128. 12 | 2. DeepSpeed config files corresponding to ZeRO-Inference offloading: (i) CPU offload, (ii) NVMe offload with AIO, and (iii) NVMe offload with NVIDIA GDS. -------------------------------------------------------------------------------- /inference/sglang/ds_offload_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "cpu", 9 | "pin_memory": true, 10 | "buffer_size": "auto" 11 | } 12 | }, 13 | "train_batch_size": 1 14 | } 15 | -------------------------------------------------------------------------------- /inference/sglang/ds_offload_nvme_aio.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "nvme", 9 | "nvme_path": "/local_nvme/sglang", 10 | "pin_memory": true, 11 | "buffer_size": "auto", 12 | "buffer_count": 5 13 | } 14 | }, 15 | "aio": { 16 | "block_size": 8388608, 17 | "queue_depth": 32, 18 | "intra_op_parallelism": 8, 19 | "single_submit": false, 20 | "overlap_events": true, 21 | "use_gds": false 22 | }, 23 | "train_batch_size": 1 24 | } 25 | -------------------------------------------------------------------------------- /inference/sglang/ds_offload_nvme_gds.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "stage3_prefetch_bucket_size": "auto", 5 | "stage3_param_persistence_threshold": "auto", 6 | "stage3_max_live_parameters": "auto", 7 | "offload_param": { 8 | "device": "nvme", 9 | "nvme_path": "/local_nvme/sglang", 10 | "pin_memory": true, 11 | "buffer_size": "auto", 12 | "buffer_count": 3 13 | } 14 | }, 15 | "aio": { 16 | "block_size": 8388608, 17 | "queue_depth": 32, 18 | "intra_op_parallelism": 8, 19 | "single_submit": false, 20 | "overlap_events": true, 21 | "use_gds": true 22 | }, 23 | "train_batch_size": 1 24 | } 25 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_1B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Llama-3.2-1B" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config 
ds_offload_nvme_gds.json 9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | 11 | 12 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_70B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json 9 | # python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | -------------------------------------------------------------------------------- /inference/sglang/run_llama3_8B.sh: -------------------------------------------------------------------------------- 1 | export LOCAL_RANK=0 2 | DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" 3 | BATCH_SIZE=128 4 | MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" 5 | 6 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json 7 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json 8 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json 9 | python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph 10 | -------------------------------------------------------------------------------- /scripts/check-license.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | from __future__ import annotations 8 | """ 9 | Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py 10 | """ 11 | 12 | import subprocess 13 | import sys 14 | 15 | 16 | def err(s: str) -> None: 17 | print(s, file=sys.stderr) 18 | 19 | 20 | COPYRIGHT = [ 21 | r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", 22 | r"^\(\/\/\|#\) DeepSpeed Team$" 23 | ] 24 | 25 | success = True 26 | failures = [] 27 | for f in sys.argv[1:]: 28 | for copyright_line in COPYRIGHT: 29 | if not success: 30 | continue 31 | res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True) 32 | if res.returncode == 1: 33 | success = False 34 | failures.append(f) 35 | elif res.returncode == 2: 36 | err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") 37 | err(res.stderr.decode("utf-8")) 38 | sys.exit(2) 39 | 40 | if not success: 41 | err(f'{failures}: Missing license at top of file') 42 | err(res.stdout.decode("utf-8")) 43 | sys.exit(1) 44 | -------------------------------------------------------------------------------- /training/BingBertGlue/glue_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 1, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 2e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": true 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": false 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertGlue/glue_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 1, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 2e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": true 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": false 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertGlue/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | -------------------------------------------------------------------------------- /training/BingBertGlue/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print( 8 | "pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 9 | "In that case, it requires TensorFlow to be installed. Please see " 10 | "https://www.tensorflow.org/install/ for installation instructions." 
11 | ) 12 | raise 13 | 14 | if len(sys.argv) != 5: 15 | # pylint: disable=line-too-long 16 | print( 17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`" 18 | ) 19 | else: 20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 21 | TF_CONFIG = sys.argv.pop() 22 | TF_CHECKPOINT = sys.argv.pop() 23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, 24 | PYTORCH_DUMP_OUTPUT) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /training/BingBertGlue/turing/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.distributed as dist 3 | 4 | logging.basicConfig( 5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 6 | datefmt='%m/%d/%Y %H:%M:%S', 7 | level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Logger(): 12 | def __init__(self, cuda=False): 13 | self.logger = logging.getLogger(__name__) 14 | self.cuda = cuda 15 | 16 | def info(self, message, *args, **kwargs): 17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda: 18 | self.logger.info(message, *args, **kwargs) 19 | 20 | def error(self, message, *args, **kwargs): 21 | self.logger.error(message, *args, **kwargs) 22 | -------------------------------------------------------------------------------- /training/BingBertGlue/turing/text.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | PAD = 0 4 | 5 | 6 | def mask(x): 7 | return x != PAD 8 | 9 | 10 | def torch_long(x): 11 | return torch.LongTensor(x) 12 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": false, 12 | "cuda_aware": false, 13 | "comm_backend_name": "mpi" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": false, 12 | "cuda_aware": true, 13 | "comm_backend_name": "mpi" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 96, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 100, 5 | "optimizer": { 6 | "type": "OnebitAdam", 7 | "params": { 8 | "lr": 3e-5, 9 | "freeze_step": 400, 10 | "weight_decay": 0.0, 11 | "bias_correction": 
false, 12 | "cuda_aware": false, 13 | "comm_backend_name": "nccl" 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /training/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522 19 | } 20 | -------------------------------------------------------------------------------- /training/BingBertSquad/deepspeed_bsz24_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 24, 3 | "train_micro_batch_size_per_gpu": 3, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /training/BingBertSquad/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import evaluate as eval 4 | 5 | if __name__ == '__main__': 6 | expected_version = '1.1' 7 | parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' + 8 | expected_version) 9 | parser.add_argument('dataset_file', help='Dataset file') 10 | parser.add_argument('prediction_file', help='Prediction File') 11 | args = parser.parse_args() 12 | 13 | print( 14 | json.dumps( 15 | eval.evaluate(expected_version, args.dataset_file, 16 | args.prediction_file))) 17 | -------------------------------------------------------------------------------- /training/BingBertSquad/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | -------------------------------------------------------------------------------- /training/DeepSpeed-Domino/requirements.txt: -------------------------------------------------------------------------------- 1 | apex 2 | deepspeed>=0.16.6 3 | nltk 4 | pybind11 5 | transformers 6 | regex 7 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==1.13.3 2 | transformers==4.5.1 3 | fire==0.4.0 4 | pytz==2021.1 5 | loguru==0.5.3 6 | sh==1.14.2 7 | pytest==6.2.5 8 | tqdm==4.62.3 -------------------------------------------------------------------------------- 
/training/HelloDeepSpeed/run.sh: -------------------------------------------------------------------------------- 1 | python train_bert.py --checkpoint_dir ./experiment 2 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/run_ds.sh: -------------------------------------------------------------------------------- 1 | deepspeed --bind_cores_to_rank train_bert_ds.py --checkpoint_dir experiment_deepspeed $@ 2 | -------------------------------------------------------------------------------- /training/HelloDeepSpeed/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/HelloDeepSpeed/tests/__init__.py -------------------------------------------------------------------------------- /training/MoQ/README.md: -------------------------------------------------------------------------------- 1 | # Not maintained / deprecated 2 | 3 | > __Warning__ 4 | > This folder/feature has been deprecated. Feel free to test and submit an issue if you run into errors. 5 | 6 | -------------------------------------------------------------------------------- /training/MoQ/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /training/MoQ/run.sh: -------------------------------------------------------------------------------- 1 | OOO=output 2 | MASTER_PORT=12345 3 | GPU=0 4 | 5 | for TSK in qnli #stsb mrpc cola wnli sst2 rte qnli qqp mnli 6 | do 7 | 8 | if [ $TSK == wnli ] || [ $TSK == mrpc ] 9 | then 10 | EPOCH_NUM=5 11 | else 12 | EPOCH_NUM=3 13 | fi 14 | 15 | if [ $TSK == qqp ] || [ $TSK == mnli ] 16 | then 17 | TEST_JSON=test_long.json 18 | else 19 | TEST_JSON=test.json 20 | fi 21 | 22 | PORT=$((MASTER_PORT+GPU)) 23 | 24 | rm -rvf ./$OOO/${TSK} 25 | 26 | CUDA_VISIBLE_DEVICES=$GPU python -m torch.distributed.launch \ 27 | --master_port $PORT \ 28 | --nproc_per_node 1 run_glue.py \ 29 | --model_name_or_path bert-base-cased \ 30 | --task_name $TSK \ 31 | --do_train \ 32 | --do_eval \ 33 | --max_seq_length 128 \ 34 | --per_device_train_batch_size 32 \ 35 | --learning_rate 2e-5 \ 36 | --num_train_epochs $EPOCH_NUM \ 37 | --output_dir ./$OOO/$TSK/ \ 38 | --fp16 \ 39 | --warmup_steps 2 \ 40 | --deepspeed test.json 41 | 42 | done 43 | -------------------------------------------------------------------------------- /training/MoQ/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_print": 10, 3 | "gradient_clipping": 1.0, 4 | "fp16": { 5 | "initial_scale_power": 16, 6 | "enabled": true 7 | }, 8 | "quantize_training": { 9 | "enabled": true, 10 | "quantize_verbose": true, 11 | "quantizer_kernel": true, 12 | "quantize_algo": { 13 | "q_type": "symmetric" 14 | }, 15 | "quantize_bits": { 16 | "start_bits": 16, 17 | "target_bits": 8 18 | }, 19 | "quantize_schedule": { 20 | "quantize_period": 400, 21 | "schedule_offset": 0 22 | }, 23 | "quantize_groups": 8 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /training/autotuning/.gitignore: -------------------------------------------------------------------------------- 1 | autotuning_results* 2 | autotuning_exps* 3 | output* 4 | mnli 5 | 
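# (autotuning_results* and autotuning_exps* are the output folders created by the DeepSpeed autotuner; see README.md)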
-------------------------------------------------------------------------------- /training/autotuning/README.md: -------------------------------------------------------------------------------- 1 | # Autotuning Examples 2 | 3 | This showcases the [autotuning](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS). 4 | -------------------------------------------------------------------------------- /training/autotuning/hf/bert-base/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "max_train_batch_size": 4096, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/bert-large/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "arg_mappings": { 7 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 8 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /training/autotuning/hf/deberta/ds_config_fp16_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "fp16": { 4 | "enabled": true, 5 | "initial_scale_power": 12 6 | }, 7 | "autotuning": { 8 | "enabled": true, 9 | "overwrite": false, 10 | "fast": true, 11 | "arg_mappings": { 12 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 13 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /training/autotuning/hf/distilbert/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "max_train_batch_size": 4096, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "fp16": { 4 | "enabled": true 5 | }, 6 | "autotuning": { 7 | "enabled": true, 8 | "overwrite": false, 9 | "fast": true, 10 | "arg_mappings": { 11 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 12 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z0.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 0 5 | }, 6 | "fp16": { 7 | "enabled": true 
8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 1 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 2 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_fp16_z3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 3 5 | }, 6 | "fp16": { 7 | "enabled": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "autotuning": { 4 | "enabled": true, 5 | "overwrite": false, 6 | "fast": true, 7 | "arg_mappings": { 8 | "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", 9 | "gradient_accumulation_steps ": "--gradient_accumulation_steps" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z0.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 0 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 2 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/autotuning/hf/dsconfigs/ds_config_z3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_optimization": { 4 | "stage": 3 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | 
"local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/deepspeed_bsz4k_01adam_config_seq512_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_ethernet/ds_train_bert_01adam_bsz4k_seq128_mpi_ethernet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_mpi_ethernet 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi \ 17 | ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --print_steps 40 \ 23 | --lr_schedule "LE" \ 24 | --lr_offset 0.0 \ 25 | --job_name $JOB_NAME \ 26 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_ethernet.json \ 27 | --data_path_prefix /data/bert \ 28 | &> ${JOB_NAME}.log" 29 | 30 | echo ${run_cmd} 31 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/deepspeed_bsz4k_01adam_config_seq512_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/mpi_infiniband/ds_train_bert_01adam_bsz4k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_mpi_infiniband 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 17 | --cf ${base_dir}/../../bert_large.json \ 18 | --max_seq_length 128 \ 19 | --output_dir $OUTPUT_DIR \ 20 | --deepspeed \ 21 | --print_steps 40 \ 22 | --lr_schedule "LE" \ 23 | --lr_offset 0.0 \ 24 | --job_name $JOB_NAME \ 25 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_mpi_infiniband.json \ 26 | --data_path_prefix /data/bert \ 27 | &> ${JOB_NAME}.log" 28 | 29 | echo ${run_cmd} 30 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 12500, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/deepspeed_bsz4k_01adam_config_seq512_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "ZeroOneAdam", 8 | "params": { 9 | "lr": 2.82e-5, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "var_freeze_step": 155000, 13 | "local_step_scaler": 32678, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "gradient_clipping": 1.0, 19 | 20 | "wall_clock_breakdown": false, 21 | 22 | "fp16": { 23 | "enabled": true, 24 | "loss_scale": 0, 25 | "initial_scale_power": 16 26 | } 27 | } -------------------------------------------------------------------------------- /training/bing_bert/01_adam/nccl/ds_train_bert_01adam_bsz4k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/zero-one-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | JOB_NAME=01adam_bsz4k_seq128_nccl 11 | OUTPUT_DIR=${base_dir}/bert_model_outputs 12 | 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
16 | run_cmd="NCCL_TREE_THRESHOLD=0 NCCL_DEBUG=INFO \ 17 | deepspeed \ 18 | ${base_dir}/../../deepspeed_train.py \ 19 | --cf ${base_dir}/../../bert_large.json \ 20 | --max_seq_length 128 \ 21 | --output_dir $OUTPUT_DIR \ 22 | --deepspeed \ 23 | --print_steps 40 \ 24 | --lr_schedule "LE" \ 25 | --lr_offset 0.0 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_01adam_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | &> ${JOB_NAME}.log" 30 | 31 | echo ${run_cmd} 32 | eval ${run_cmd} -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_ethernet/deepspeed_bsz4k_onebitadam_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": false, 14 | "comm_backend_name": "mpi" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_infiniband/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": true, 14 | "comm_backend_name": "mpi" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/mpi_infiniband/ds_train_bert_onebitadam_bsz4k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you are able to install pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs), 5 | # we highly recommend you to use the NCCL-based 1-bit Adam 6 | # which has better performance and ease of use 7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_adam/nccl 8 | # and read the tutorial for more details: 9 | # https://www.deepspeed.ai/tutorials/onebit-adam/) 10 | 11 | base_dir=`pwd` 12 | 13 | # Where should we save checkpoints and tensorboard events? 
14 | JOB_NAME=onebit_adam_4k_seq128_mpi_infiniband 15 | OUTPUT_DIR=${base_dir}/bert_model_outputs 16 | 17 | mkdir -p $OUTPUT_DIR 18 | 19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 20 | --cf ${base_dir}/../../bert_large.json \ 21 | --max_seq_length 128 \ 22 | --output_dir $OUTPUT_DIR \ 23 | --deepspeed_mpi \ 24 | --deepspeed \ 25 | --deepspeed_transformer_kernel \ 26 | --print_steps 40 \ 27 | --lr_schedule "LE" \ 28 | --lr_offset 0.0 \ 29 | --job_name $JOB_NAME \ 30 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_mpi_infiniband.json \ 31 | --data_path_prefix /data/bert \ 32 | &> ${JOB_NAME}.log 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/nccl/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitAdam", 8 | "params": { 9 | "lr": 4e-4, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "freeze_step": 23000, 13 | "cuda_aware": false, 14 | "comm_backend_name": "nccl" 15 | } 16 | }, 17 | "gradient_clipping": 1.0, 18 | 19 | "wall_clock_breakdown": false, 20 | 21 | "fp16": { 22 | "enabled": true, 23 | "loss_scale": 0, 24 | "initial_scale_power": 16 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_adam/nccl/ds_train_bert_onebitadam_bsz4k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/onebit-adam/ 7 | 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 11 | JOB_NAME=onebit_adam_4k_seq128_nccl 12 | OUTPUT_DIR=${base_dir}/bert_model_outputs 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 
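# NCCL_SOCKET_IFNAME pins NCCL's socket traffic to a specific NIC; replace eth0 if your nodes use a different interface name.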
17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --print_steps 40 \ 24 | --lr_schedule "LE" \ 25 | --lr_offset 0.0 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_onebitadam_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | &> ${JOB_NAME}.log 30 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": false, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_ethernet/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_ethernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": false, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz32k_onebitlamb_config_seq512_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": true, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- 
/training/bing_bert/1-bit_lamb/mpi_infiniband/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": true, 16 | "comm_backend_name": "mpi", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/mpi_infiniband/ds_train_bert_onebitlamb_bsz64k_seq128_mpi_infiniband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you are able to install pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs), 5 | # we highly recommend you to use the NCCL-based 1-bit Lamb 6 | # which has better performance and ease of use 7 | # (see scripts in DeepSpeedExamples/bing_bert/1-bit_lamb/nccl 8 | # and read the tutorial for more details: 9 | # https://www.deepspeed.ai/tutorials/onebit-lamb/) 10 | 11 | base_dir=`pwd` 12 | 13 | # Where should we save checkpoints and tensorboard events? 14 | JOB_NAME=onebit_lamb_64k_seq128_mpi_infiniband 15 | OUTPUT_DIR=${base_dir}/bert_model_outputs 16 | 17 | mkdir -p $OUTPUT_DIR 18 | 19 | NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ${base_dir}/../../deepspeed_train.py \ 20 | --cf ${base_dir}/../../bert_large_lamb.json \ 21 | --max_seq_length 128 \ 22 | --output_dir $OUTPUT_DIR \ 23 | --deepspeed_mpi \ 24 | --deepspeed \ 25 | --deepspeed_transformer_kernel \ 26 | --print_steps 40 \ 27 | --lr_schedule "EE" \ 28 | --lr_offset 10e-4 \ 29 | --job_name $JOB_NAME \ 30 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_mpi_infiniband.json \ 31 | --data_path_prefix /data/bert \ 32 | &> ${JOB_NAME}.log 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz32k_onebitlamb_config_seq512_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 6100, 15 | "cuda_aware": false, 16 | "comm_backend_name": "nccl", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "OneBitLamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01, 14 | "freeze_step": 1000, 15 | "cuda_aware": false, 16 | "comm_backend_name": "nccl", 17 | "coeff_beta": 0.9, 18 | "factor_max": 4.0, 19 | "factor_min": 0.5, 20 | "factor_threshold": 0.1 21 | } 22 | }, 23 | "gradient_clipping": 1.0, 24 | 25 | "wall_clock_breakdown": false, 26 | 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 16 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /training/bing_bert/1-bit_lamb/nccl/ds_train_bert_onebitlamb_bsz64k_seq128_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires pytorch >= 1.8 4 | # (and nccl >= 2.8.3 if you have 64 or more GPUs). 5 | # Read the tutorial for more details: 6 | # https://www.deepspeed.ai/tutorials/onebit-lamb 7 | 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 11 | JOB_NAME=onebit_lamb_64k_seq128_nccl 12 | OUTPUT_DIR=${base_dir}/bert_model_outputs 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | # NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed. 17 | NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ${base_dir}/../../deepspeed_train.py \ 18 | --cf ${base_dir}/../../bert_large_lamb.json \ 19 | --max_seq_length 128 \ 20 | --output_dir $OUTPUT_DIR \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --print_steps 40 \ 24 | --lr_schedule "EE" \ 25 | --lr_offset 10e-4 \ 26 | --job_name $JOB_NAME \ 27 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_onebitlamb_config_seq128_nccl.json \ 28 | --data_path_prefix /data/bert \ 29 | --ckpt_to_save 150 \ 30 | &> ${JOB_NAME}.log 31 | -------------------------------------------------------------------------------- /training/bing_bert/bert_dataset_provider.py: -------------------------------------------------------------------------------- 1 | class BertDatasetProviderInterface: 2 | def get_shard(self, index, shuffle=True): 3 | raise NotImplementedError 4 | 5 | def release_shard(self, index): 6 | raise NotImplementedError 7 | 8 | def prefetch_shard(self, index): 9 | raise NotImplementedError 10 | 11 | def get_batch(self, batch_iter): 12 | raise NotImplementedError 13 | 14 | def prefetch_batch(self): 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /training/bing_bert/data_worker.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import queue 3 | import time 4 | 5 | 6 | class AsyncWorker(threading.Thread): 7 | def __init__(self, dataloaders, dataset_picker): 8 | threading.Thread.__init__(self) 9 | self.req_queue = queue.Queue() 10 | self.ret_queue = queue.Queue() 11 | self.dataloaders = dataloaders 12 | self.dataset_picker = dataset_picker 13 | self.prefetch_idx = 3 14 | for i in range(self.prefetch_idx): 15 | self.req_queue.put(dataset_picker[i]) 16 | 17 | def run(self): 18 | while True: 19 | dataset_type = self.req_queue.get(block=True) 20 | if dataset_type is None: 21 | break 22 | batch = next(self.dataloaders[dataset_type]) 23 | self.req_queue.task_done() 24 | 
self.ret_queue.put(batch) 25 | 26 | def get(self): 27 | batch = self.ret_queue.get() 28 | self.ret_queue.task_done() 29 | return batch 30 | 31 | def prefetch(self): 32 | if self.prefetch_idx < len(self.dataset_picker): 33 | self.req_queue.put(self.dataset_picker[self.prefetch_idx]) 34 | self.prefetch_idx += 1 35 | 36 | def stop(self): 37 | self.req_queue.put(None) 38 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz32k_lamb_config_seq512.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32768, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "Lamb", 8 | "params": { 9 | "lr": 2e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | 18 | "wall_clock_breakdown": false, 19 | 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 4096, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": true, 6 | "gradient_predivide_factor": 8, 7 | "optimizer": { 8 | "type": "Adam", 9 | "params": { 10 | "lr": 1e-3, 11 | "weight_decay": 0.01, 12 | "bias_correction": false 13 | } 14 | }, 15 | "gradient_clipping": 1.0, 16 | "wall_clock_breakdown": false, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0 20 | }, 21 | "progressive_layer_drop": { 22 | "enabled": true, 23 | "theta": 0.5, 24 | "gamma": 0.001 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /training/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 65536, 3 | "train_micro_batch_size_per_gpu": 64, 4 | "steps_per_print": 1000, 5 | "prescale_gradients": false, 6 | "optimizer": { 7 | "type": "Lamb", 8 | "params": { 9 | "lr": 11e-3, 10 | "weight_decay": 0.01, 11 | "bias_correction": false, 12 | "max_coeff": 0.3, 13 | "min_coeff": 0.01 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | 18 | "wall_clock_breakdown": false, 19 | 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0 23 | }, 24 | "sparse_attention": { 25 | "mode": "fixed", 26 | "block": 16, 27 | "different_layout_per_head": true, 28 | "num_local_blocks": 4, 29 | "num_global_blocks": 1, 30 | "attention": "bidirectional", 31 | "horizontal_global_attention": false, 32 | "num_different_global_patterns": 4 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /training/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script runs deepspeed using sparse attention for BertEncoderLayer. 4 | 5 | base_dir=`pwd` 6 | 7 | # Where should we save checkpoints and tensorboard events? 
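# The sparse_attention block consumed by this run is defined in deepspeed_bsz64k_lamb_config_seq128.json (passed via --deepspeed_config below).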
8 | JOB_NAME=lamb_64k_seq128 9 | OUTPUT_DIR=${base_dir}/bert_model_outputs 10 | 11 | mkdir -p $OUTPUT_DIR 12 | 13 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 14 | --cf ${base_dir}/bert_large_lamb.json \ 15 | --max_seq_length 128 \ 16 | --output_dir $OUTPUT_DIR \ 17 | --deepspeed \ 18 | --deepspeed_sparse_attention \ 19 | --print_steps 100 \ 20 | --lr_schedule "EE" \ 21 | --lr_offset 10e-4 \ 22 | --job_name $JOB_NAME \ 23 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 24 | --data_path_prefix /data/bert \ 25 | &> ${JOB_NAME}.log 26 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_bsz32k_seq512.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_32k_chkpt150_seq512 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | # Assumes job name in previous seq128 run, will resume training from epoch 150 10 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_64k_seq128 11 | CHECKPOINT_EPOCH150_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch150_*` 12 | echo "checkpoint id: $CHECKPOINT_EPOCH150_NAME" 13 | 14 | mkdir -p $OUTPUT_DIR 15 | 16 | deepspeed ${base_dir}/deepspeed_train.py \ 17 | --cf ${base_dir}/bert_large_lamb.json \ 18 | --max_seq_length 512 \ 19 | --output_dir $OUTPUT_DIR \ 20 | --print_steps 100 \ 21 | --deepspeed \ 22 | --deepspeed_transformer_kernel \ 23 | --job_name $JOB_NAME \ 24 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \ 25 | --data_path_prefix /data/bert \ 26 | --validation_data_path_prefix /data/bert \ 27 | --rewarmup \ 28 | --lr_schedule "EE" \ 29 | --attention_dropout_checkpoint \ 30 | --lr_offset 0.0 \ 31 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \ 32 | --load_checkpoint_id ${CHECKPOINT_EPOCH150_NAME} \ 33 | &> ${JOB_NAME}.log 34 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_64k_seq128 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 12 | --cf ${base_dir}/bert_large_lamb.json \ 13 | --max_seq_length 128 \ 14 | --output_dir $OUTPUT_DIR \ 15 | --deepspeed \ 16 | --deepspeed_transformer_kernel \ 17 | --print_steps 100 \ 18 | --lr_schedule "EE" \ 19 | --lr_offset 10e-4 \ 20 | --job_name $JOB_NAME \ 21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 22 | --data_path_prefix /data/bert \ 23 | &> ${JOB_NAME}.log 24 | -------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_nvidia_data_bsz32k_seq512.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z $1 ]]; then 4 | LOAD_EPOCH=16 5 | else 6 | LOAD_EPOCH=$1 7 | fi 8 | base_dir=`pwd` 9 | 10 | # Where should we save checkpoints and tensorboard events? 
11 | JOB_NAME=lamb_nvidia_data_32k_chkpt${LOAD_EPOCH}_seq512 12 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs 13 | 14 | # Assumes the job name of the previous seq128 run; will resume training from epoch 16 by default 15 | CHECKPOINT_BASE_PATH=${OUTPUT_DIR}/saved_models/lamb_nvidia_data_64k_seq128 16 | CHECKPOINT_EPOCH_NAME=`basename ${CHECKPOINT_BASE_PATH}/epoch${LOAD_EPOCH}_*` 17 | echo "checkpoint id: $CHECKPOINT_EPOCH_NAME" 18 | 19 | mkdir -p $OUTPUT_DIR 20 | 21 | deepspeed ${base_dir}/deepspeed_train.py \ 22 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \ 23 | --max_seq_length 512 \ 24 | --output_dir $OUTPUT_DIR \ 25 | --print_steps 1 \ 26 | --deepspeed \ 27 | --deepspeed_transformer_kernel \ 28 | --job_name $JOB_NAME \ 29 | --deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \ 30 | --data_path_prefix /workspace/bert \ 31 | --use_nvidia_dataset \ 32 | --rewarmup \ 33 | --lr_schedule "EE" \ 34 | --attention_dropout_checkpoint \ 35 | --lr_offset 0.0 \ 36 | --load_training_checkpoint ${CHECKPOINT_BASE_PATH} \ 37 | --load_checkpoint_id ${CHECKPOINT_EPOCH_NAME} \ 38 | &> ${JOB_NAME}.log 39 | 
-------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_nvidia_data_bsz64k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 6 | JOB_NAME=lamb_nvidia_data_64k_seq128 7 | OUTPUT_DIR=${base_dir}/bert_model_nvidia_data_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \ 12 | --cf ${base_dir}/bert_large_lamb_nvidia_data.json \ 13 | --max_seq_length 128 \ 14 | --output_dir $OUTPUT_DIR \ 15 | --deepspeed \ 16 | --deepspeed_transformer_kernel \ 17 | --print_steps 100 \ 18 | --lr_schedule "EE" \ 19 | --lr_offset 10e-4 \ 20 | --job_name $JOB_NAME \ 21 | --deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \ 22 | --data_path_prefix /workspace/bert \ 23 | --use_nvidia_dataset \ 24 | &> ${JOB_NAME}.log 25 | 
-------------------------------------------------------------------------------- /training/bing_bert/ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_dir=`pwd` 4 | 5 | # Where should we save checkpoints and tensorboard events? 
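# Progressive layer drop (the theta/gamma schedule) is configured in deepspeed_bsz4k_progressive_layer_drop_config_seq128.json, passed below.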
6 | JOB_NAME=adam_4k_seq128_progressive_layer_drop 7 | OUTPUT_DIR=${base_dir}/bert_model_outputs 8 | 9 | mkdir -p $OUTPUT_DIR 10 | 11 | config="--progressive_layer_drop" 12 | 13 | NCCL_TREE_THRESHOLD=0 deepspeed \ 14 | ${base_dir}/deepspeed_train.py \ 15 | --cf ${base_dir}/bert_base_large_lr.json \ 16 | --max_seq_length 128 \ 17 | --output_dir $OUTPUT_DIR \ 18 | --deepspeed \ 19 | --print_steps 100 \ 20 | --lr_schedule "LE" \ 21 | --job_name $JOB_NAME \ 22 | --deepspeed_config ${base_dir}/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json \ 23 | --data_path_prefix /data/bert \ 24 | ${config} \ 25 | &> ${JOB_NAME}.log 26 | 
-------------------------------------------------------------------------------- /training/bing_bert/glue_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | 
-------------------------------------------------------------------------------- /training/bing_bert/glue_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 10, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 3e-5, 9 | "weight_decay": 0.0, 10 | "bias_correction": false 11 | } 12 | }, 13 | "gradient_clipping": 1.0, 14 | "fp16": { 15 | "enabled": true 16 | } 17 | 18 | } 19 | 
-------------------------------------------------------------------------------- /training/bing_bert/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | 
-------------------------------------------------------------------------------- /training/bing_bert/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print( 8 | "pytorch_pretrained_bert can only be used from the command line to convert TensorFlow checkpoints to PyTorch models. " 9 | "In that case, it requires TensorFlow to be installed. Please see " 10 | "https://www.tensorflow.org/install/ for installation instructions." 
11 | ) 12 | raise 13 | 14 | if len(sys.argv) != 5: 15 | # pylint: disable=line-too-long 16 | print( 17 | "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`" 18 | ) 19 | else: 20 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 21 | TF_CONFIG = sys.argv.pop() 22 | TF_CHECKPOINT = sys.argv.pop() 23 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, 24 | PYTORCH_DUMP_OUTPUT) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | 
-------------------------------------------------------------------------------- /training/bing_bert/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | 
-------------------------------------------------------------------------------- /training/bing_bert/turing/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.distributed as dist 3 | 4 | logging.basicConfig( 5 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 6 | datefmt='%m/%d/%Y %H:%M:%S', 7 | level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Logger(): 12 | def __init__(self, cuda=False): 13 | self.logger = logging.getLogger(__name__) 14 | self.cuda = cuda 15 | 16 | def info(self, message, *args, **kwargs): 17 | if (self.cuda and dist.get_rank() == 0) or not self.cuda: 18 | self.logger.info(message, *args, **kwargs) 19 | 20 | def error(self, message, *args, **kwargs): 21 | self.logger.error(message, *args, **kwargs) 22 | 
-------------------------------------------------------------------------------- /training/bing_bert/turing/text.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | PAD = 0 4 | 5 | 6 | def mask(x): 7 | return x != PAD 8 | 9 | 10 | def torch_long(x): 11 | return torch.LongTensor(x) 12 | 
-------------------------------------------------------------------------------- /training/cifar/README.md: -------------------------------------------------------------------------------- 1 | Thanks to Gopi Kumar for contributing this example, which demonstrates how to apply DeepSpeed to the CIFAR-10 model. 2 | 3 | `cifar10_tutorial.py` 4 | Baseline CIFAR-10 model. 5 | 6 | `cifar10_deepspeed.py` 7 | CIFAR-10 model with DeepSpeed applied. 8 | 9 | `run_ds.sh` 10 | Script for running the DeepSpeed-applied model. 11 | 12 | `run_ds_moe.sh` 13 | Script for running the DeepSpeed model with Mixture of Experts (MoE) integration. 14 | 15 | `run_ds_prmoe.sh` 16 | Script for running the DeepSpeed model with Pyramid Residual MoE (PR-MoE) integration. 
17 | 18 | * To run baseline CIFAR-10 model - `python cifar10_tutorial.py` 19 | * To run DeepSpeed CIFAR-10 model - `bash run_ds.sh` 20 | * To run DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - `bash run_ds_moe.sh` 21 | * To run DeepSpeed CIFAR-10 model with Pyramid Residual MoE (PR-MoE) - `bash run_ds_prmoe.sh` 22 | * To run with different data type (default=`fp16`) and zero stages (default=`0`) - `bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}` 23 | -------------------------------------------------------------------------------- /training/cifar/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision==0.4.0 2 | pillow>=7.1.0 3 | matplotlib 4 | -------------------------------------------------------------------------------- /training/cifar/run_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed --bind_cores_to_rank cifar10_deepspeed.py --deepspeed $@ 4 | -------------------------------------------------------------------------------- /training/cifar/run_ds_moe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Number of nodes 4 | NUM_NODES=1 5 | # Number of GPUs per node 6 | NUM_GPUS=2 7 | # Size of expert parallel world (should be less than total world size) 8 | EP_SIZE=2 9 | # Number of total experts 10 | EXPERTS=2 11 | 12 | deepspeed --num_nodes=${NUM_NODES}\ 13 | --num_gpus=${NUM_GPUS} \ 14 | --bind_cores_to_rank \ 15 | cifar10_deepspeed.py \ 16 | --log-interval 100 \ 17 | --deepspeed \ 18 | --moe \ 19 | --ep-world-size ${EP_SIZE} \ 20 | --num-experts ${EXPERTS} \ 21 | --top-k 1 \ 22 | --noisy-gate-policy 'RSample' \ 23 | --moe-param-group 24 | -------------------------------------------------------------------------------- /training/cifar/run_ds_prmoe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Number of nodes 4 | NUM_NODES=1 5 | # Number of GPUs per node 6 | NUM_GPUS=2 7 | # Size of expert parallel world (should be less than total world size) 8 | EP_SIZE=2 9 | # Number of total experts, note here we need to pass >= two numbers (numbers can be different) 10 | EXPERTS='2 4' 11 | 12 | deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \ 13 | --log-interval 100 \ 14 | --deepspeed \ 15 | --moe \ 16 | --ep-world-size ${EP_SIZE} \ 17 | --num-experts ${EXPERTS} \ 18 | --top-k 1 \ 19 | --mlp-type 'residual' \ 20 | --noisy-gate-policy 'RSample' \ 21 | --moe-param-group 22 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/bash_script/run_medium_random_ltd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ##################apply random-ltd to fine-tune ptb on GPT-medium (24-layer)############################## 3 | ####see more on random-ltd: https://arxiv.org/abs/2211.11586 4 | export CUDA_VISIBLE_DEVICES=2 5 | mkdir -p ./output/check_medium 6 | python -m torch.distributed.launch --nproc_per_node=1 \ 7 | --master_port 12345 \ 8 | run_clm_no_trainer.py \ 9 | --random_ltd \ 10 | --dataset_name ptb_text_only \ 11 | --dataset_config_name penn_treebank \ 12 | --model_name_or_path gpt2-medium \ 13 | --per_device_train_batch_size 2 \ 14 | --per_device_eval_batch_size 2 \ 15 | --num_train_epochs 2 \ 16 | --deepspeed_config config/ds_config_gpt_medium_random_ltd.json \ 17 | 
--deepspeed --seed 1234 --num_warmup_steps 100 \ 18 | --output_dir ./output/check_medium &> ./output/check_medium/training.log -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/config/ds_config_gpt_base_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 4, 3 | "train_micro_batch_size_per_gpu": 2, 4 | "steps_per_print": 2, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | "enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": "attention_mask", 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 128, 37 | "max_value": 1024, 38 | "schedule_type": "fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 400, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same as what you used during map job. We need these 2 4 | # configs to know how many map job result files do we have. 5 | num_workers=1 6 | num_threads=1 7 | # Reduce job only has 1 worker but can accelerate by multithreading. 
8 | num_threads_reduce=1 9 | 10 | save_path="/blob/users/conglli/data/analysis_ptb_gpt/" 11 | 12 | metric='total_vocab_freq' 13 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 14 | 15 | dataset_name="ptb_text_only" 16 | dataset_config_name="penn_treebank" 17 | model_name_or_path="gpt2-medium" 18 | 19 | batch_size=1000 20 | 21 | jobname="gpt-ptb-analyzing-${metric}-reduce" 22 | 23 | options=" \ 24 | --analyzing_task reduce \ 25 | --analyzing_metric ${metric} \ 26 | --analyzing_num_workers ${num_workers} \ 27 | --analyzing_num_threads ${num_threads} \ 28 | --analyzing_num_threads_reduce ${num_threads_reduce} \ 29 | --dataset_name ${dataset_name} \ 30 | --dataset_config_name ${dataset_config_name} \ 31 | --model_name_or_path ${model_name_or_path} \ 32 | --per_device_train_batch_size ${batch_size} \ 33 | --output_dir ${save_path}" 34 | 35 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/finetune/ds_config_gpt2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : GB_SIZE, 3 | "train_micro_batch_size_per_gpu": MB_SIZE, 4 | "steps_per_print": 10, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "fp16":{ 9 | "enabled": false 10 | }, 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": true, 13 | "wall_clock_breakdown" : false, 14 | "data_efficiency": { 15 | "enabled": true, 16 | "data_routing": { 17 | "enabled": LTD_ENABLED, 18 | "random_ltd":{ 19 | "enabled": LTD_ENABLED, 20 | "total_layer_num": 12, 21 | "random_ltd_layer_num": 10, 22 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 23 | "model_mask_name": "attention_mask", 24 | "model_type": "decoder", 25 | "hidden_state_order": "batch_seq_dim", 26 | "random_ltd_schedule": { 27 | "min_value": LTD_MIN, 28 | "max_value": 1024, 29 | "schedule_type": "fixed_linear", 30 | "schedule_config": { 31 | "require_steps": LTD_STEP, 32 | "seq_per_step": 8 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /training/data_efficiency/gpt_finetuning/requirement.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.8.0 2 | sentencepiece != 0.1.92 3 | protobuf 4 | transformers == 4.15.0 5 | accelerate -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_attn_matrix.png -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr.png -------------------------------------------------------------------------------- /training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/data_efficiency/variable_batch_size_and_lr/variable_batch_lr_pipeline.png -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/bash_script/run_cifar_random_ltd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | mkdir -p out/cifar/ 5 | # deepspeed --include worker-0:0 --master_port 60000 main_cifar.py \ 6 | # --deepspeed_config config/ds_config.json \ 7 | # --deepspeed --random_ltd \ 8 | # --dataset cifar10vit224 \ 9 | # --seed 1234 \ 10 | # --printfreq 400 \ 11 | # --arch lvits16r224 \ 12 | # --optimizer sgd \ 13 | # --lr 0.0001 --seq_len 197 \ 14 | # --scheduler constant \ 15 | # --epochs 14 \ 16 | # --batchsize 32 \ 17 | # --data_outdir check/cifar/ | tee -a check/cifar/training.log 18 | 19 | deepspeed --num_nodes 1 --num_gpus 1 --master_port 60000 main_cifar.py \ 20 | --deepspeed_config config/ds_config_cifar_random_ltd.json \ 21 | --deepspeed --random_ltd \ 22 | --dataset cifar10vit224 \ 23 | --seed 1234 \ 24 | --printfreq 400 \ 25 | --arch vits16r224 \ 26 | --optimizer sgd \ 27 | --lr 0.0001 --seq_len 197 \ 28 | --scheduler constant \ 29 | --epochs 14 \ 30 | --batchsize 128 \ 31 | --data_outdir out/cifar/ | tee -a out/cifar/training1.log -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/config/ds_config_cifar_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 32, 3 | "train_micro_batch_size_per_gpu": 32, 4 | "steps_per_print": 200, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | "enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": null, 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 32, 37 | "max_value": 197, 38 | "schedule_type":"fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 3910, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/config/ds_config_imagenet_random_ltd.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 200, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.0001, 9 | "betas": [0.8,0.999], 10 | "eps": 1e-8, 11 | "weight_decay": 3e-7 12 | } 13 | }, 14 | "zero_optimization": { 15 | "stage": 0 16 | }, 17 | "fp16":{ 18 | "enabled": false 19 | }, 20 | "gradient_clipping": 1.0, 21 | "prescale_gradients": true, 22 | "wall_clock_breakdown" : false, 23 | "data_efficiency": { 24 | "enabled": true, 25 | "data_routing": { 26 | "enabled": true, 27 | "random_ltd":{ 28 | 
"enabled": true, 29 | "total_layer_num": 12, 30 | "random_ltd_layer_num": 10, 31 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10], 32 | "model_mask_name": null, 33 | "model_type": "decoder", 34 | "hidden_state_order": "batch_seq_dim", 35 | "random_ltd_schedule": { 36 | "min_value": 32, 37 | "max_value": 197, 38 | "schedule_type":"fixed_linear", 39 | "schedule_config": { 40 | "require_steps": 3910, 41 | "seq_per_step": 8 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from .vit import * 17 | from .vit import Block 18 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/requirement.txt: -------------------------------------------------------------------------------- 1 | timm==0.6.5 2 | torch>1.10.0 3 | torchvision>0.11.1 4 | mpi4py 5 | -------------------------------------------------------------------------------- /training/data_efficiency/vit_finetuning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .utils import get_model, get_optimizer, get_scheduler, LossTracker, AverageMeter, ProgressMeter, accuracy, run_cmd 16 | from .get_data import get_dataset 17 | 18 | __all__ = ["get_dataset", "ImageMemFolder", "AverageMeter", "ProgressMeter", "accuracy", "get_optimizer", "get_scheduler", "get_model", "LossTracker", "run_cmd"] 19 | -------------------------------------------------------------------------------- /training/gan/gan_baseline_run.sh: -------------------------------------------------------------------------------- 1 | python gan_baseline_train.py --dataset celeba --cuda --tensorboard_path './runs/baseline' 2 | -------------------------------------------------------------------------------- /training/gan/gan_deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 64, 3 | "optimizer": { 4 | "type": "Adam", 5 | "params": { 6 | "lr": 0.0002, 7 | "betas": [ 8 | 0.5, 9 | 0.999 10 | ], 11 | "eps": 1e-8 12 | } 13 | }, 14 | "steps_per_print" : 10 15 | } 16 | -------------------------------------------------------------------------------- /training/gan/gan_deepspeed_run.sh: -------------------------------------------------------------------------------- 1 | deepspeed gan_deepspeed_train.py --dataset celeba --cuda --deepspeed_config gan_deepspeed_config.json --tensorboard_path './runs/deepspeed' 2 | -------------------------------------------------------------------------------- /training/imagenet/assets/resnetplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeedExamples/4579df3f6ba4bcf28d6dc99a2e11e4144da52b4b/training/imagenet/assets/resnetplot.png -------------------------------------------------------------------------------- /training/imagenet/config/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": false 25 | }, 26 | "gradient_clipping": 0, 27 | "prescale_gradients": false, 28 | "cuda_visible_devices": 0, 29 | "wall_clock_breakdown" : false 30 | } 31 | -------------------------------------------------------------------------------- /training/imagenet/config/ds_fp16_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 0 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": true, 25 | "auto_cast": true 26 | }, 27 | "gradient_clipping": 0, 28 | "prescale_gradients": false, 29 | "cuda_visible_devices": 0, 30 | "wall_clock_breakdown" : false 31 | } 32 | -------------------------------------------------------------------------------- /training/imagenet/config/ds_fp16_z1_config.json: -------------------------------------------------------------------------------- 1 | { 2 
| "train_batch_size": 256, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | 6 | "optimizer": { 7 | "type": "Adam", 8 | "params": { 9 | "lr": 0.001, 10 | "betas": [ 11 | 0.8, 12 | 0.999 13 | ], 14 | "eps": 1e-8, 15 | "weight_decay": 3e-7 16 | } 17 | }, 18 | 19 | "zero_optimization": { 20 | "stage": 1 21 | }, 22 | "zero_allow_untested_optimizer": true, 23 | "fp16": { 24 | "enabled": true, 25 | "auto_cast": true 26 | }, 27 | "gradient_clipping": 0, 28 | "prescale_gradients": false, 29 | "cuda_visible_devices": 0, 30 | "wall_clock_breakdown" : false 31 | } 32 | -------------------------------------------------------------------------------- /training/imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /training/imagenet/run_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/imagenet/run_ds_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/imagenet/run_ds_fp16_z1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_z1_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet 4 | -------------------------------------------------------------------------------- /training/megatron/README.md: -------------------------------------------------------------------------------- 1 | # Not maintained / deprecated 2 | 3 | > __Warning__ 4 | > all future/current changes are now in new [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). 
5 | -------------------------------------------------------------------------------- /training/offload_states/output_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pytablewriter import MarkdownTableWriter 3 | 4 | 5 | def read_csv(file_path): 6 | return pd.read_csv(file_path) 7 | 8 | df = read_csv('offload_states.log') 9 | df.columns = ['pin_memory', 'non_blocking', 'offload_time', 'load_time'] 10 | 11 | df['ratio_string'] = df['offload_time'].round(2).astype(str) + " / " + df['load_time'].round(2).astype(str) 12 | 13 | result_df = pd.DataFrame({ 14 | 'pin_memory=0_non_blocking=0': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True), 15 | 'pin_memory=0_non_blocking=1': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True), 16 | 'pin_memory=1_non_blocking=0': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True), 17 | 'pin_memory=1_non_blocking=1': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True) 18 | }) 19 | result_df = result_df.dropna() 20 | result_df.index = range(1, len(result_df) + 1) 21 | result_df.index.name = 'trial' 22 | # print(result_df) 23 | 24 | writer = MarkdownTableWriter() 25 | writer.from_dataframe(result_df, 26 | add_index_column=True, 27 | ) 28 | writer.write_table() -------------------------------------------------------------------------------- /training/offload_states/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | NGPUS=4 2 | HIDDEN_SIZE=32768 3 | NUM_LAYERS=4 4 | 5 | TRIALS=10 6 | 7 | PIN_MEMORY_OPTS=(0 1) 8 | NON_BLOCKING_OPTS=(0 1) 9 | 10 | for i in $(seq 1 $TRIALS); do 11 | for PIN_MEMORY in "${PIN_MEMORY_OPTS[@]}"; do 12 | PIN_MEMORY_ARG="" 13 | if [ $PIN_MEMORY -eq 1 ]; then 14 | PIN_MEMORY_ARG="--pin_memory" 15 | fi 16 | 17 | for NON_BLOCKING in "${NON_BLOCKING_OPTS[@]}"; do 18 | NON_BLOCKING_ARG="" 19 | if [ $NON_BLOCKING -eq 1 ]; then 20 | NON_BLOCKING_ARG="--non_blocking" 21 | fi 22 | 23 | echo "Running iteration $i" 24 | deepspeed --num_gpus=$NGPUS offload_states.py --hidden_dim $HIDDEN_SIZE --nlayers $NUM_LAYERS $PIN_MEMORY_ARG $NON_BLOCKING_ARG 25 | done 26 | done 27 | done 28 | python output_table.py 29 | -------------------------------------------------------------------------------- /training/pipeline_parallelism/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu" : 8, 4 | 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.001, 9 | "betas": [ 10 | 0.9, 11 | 0.999 12 | ], 13 | "eps": 1e-8 14 | } 15 | }, 16 | 17 | "steps_per_print" : 10, 18 | "wall_clock_breakdown" : false 19 | } 20 | -------------------------------------------------------------------------------- /training/pipeline_parallelism/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed train.py --deepspeed_config=ds_config.json -p 2 --steps=200 4 | -------------------------------------------------------------------------------- /training/stable_diffusion/mytrainbash.sh: -------------------------------------------------------------------------------- 1 | export MODEL_NAME="stabilityai/stable-diffusion-2-1-base" 2 | export OUTPUT_DIR="./sd-distill-v21" 3 | 4 | if [ ! 
-d "$OUTPUT_DIR" ]; then 5 | mkdir "$OUTPUT_DIR" 6 | echo "Folder '$OUTPUT_DIR' created" 7 | else 8 | echo "Folder '$OUTPUT_DIR' already exists" 9 | fi 10 | 11 | 12 | accelerate launch train_sd_distil_lora.py \ 13 | --pretrained_model_name_or_path=$MODEL_NAME \ 14 | --output_dir=$OUTPUT_DIR \ 15 | --default_prompt="A man dancing" \ 16 | --resolution=512 \ 17 | --train_batch_size=1 \ 18 | --gradient_accumulation_steps=1 \ 19 | --learning_rate=5e-6 \ 20 | --lr_scheduler="constant" \ 21 | --lr_warmup_steps=0 22 | -------------------------------------------------------------------------------- /training/stable_diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.16.0 2 | torchvision 3 | transformers>=4.25.1 4 | ftfy 5 | tensorboard 6 | Jinja2 7 | -------------------------------------------------------------------------------- /training/tensor_parallel/README.md: -------------------------------------------------------------------------------- 1 | # tensor parallel example 2 | This project is adapted from https://github.com/tatsu-lab/stanford_alpaca. 3 | We only modified the ds_config to enable tensor parallelism and more detailed logging, as an example use case. 4 | 5 | **Script** 6 | 7 | ``` bash run.sh ``` or ```bash run.sh MODE``` 8 | 9 | 10 | -------------------------------------------------------------------------------- /training/tensor_parallel/configs/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 1, 25 | "gather_16bit_weights_on_model_save": true 26 | }, 27 | "tensor_parallel":{ 28 | "autotp_size": 4 29 | }, 30 | "gradient_accumulation_steps": "auto", 31 | "gradient_clipping": "auto", 32 | "steps_per_print": 1, 33 | "train_batch_size": "auto", 34 | "train_micro_batch_size_per_gpu": "auto", 35 | "wall_clock_breakdown": false 36 | } -------------------------------------------------------------------------------- /training/tensor_parallel/configs/ds_config_temp.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": ${zero_stage}, 25 | "gather_16bit_weights_on_model_save": true 26 | }, 27 | "tensor_parallel":{ 28 | "autotp_size": ${autotp_size} 29 | }, 30 | "gradient_accumulation_steps": "auto", 31 | "gradient_clipping": "auto", 32 | "steps_per_print": 1, 33 | "train_batch_size": "auto", 34 | "train_micro_batch_size_per_gpu": "auto", 35 | "wall_clock_breakdown": false 36 | } -------------------------------------------------------------------------------- /training/tensor_parallel/requirements.txt: 
-------------------------------------------------------------------------------- 1 | transformers==4.50.1 2 | deepspeed>=0.16.4 3 | accelerate==1.6.0 4 | numpy 5 | rouge_score 6 | fire 7 | openai==0.28.0 8 | torch 9 | sentencepiece 10 | tokenizers>=0.13.3 11 | --------------------------------------------------------------------------------
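Usage note: `configs/ds_config_temp.json` above is a shell-style template whose `${zero_stage}` and `${autotp_size}` placeholders must be filled in before DeepSpeed can parse it. A minimal sketch of one way to render it, assuming GNU gettext's `envsubst` is installed; the repository's `run.sh` may do this differently, and the values and output filename below are hypothetical:

```bash
#!/bin/bash
# Hypothetical settings: ZeRO stage 1 with 4-way automatic tensor parallelism.
export zero_stage=1
export autotp_size=4
# Substitute only these two variables (other "$" occurrences are left intact)
# and write a concrete config file next to the template.
envsubst '${zero_stage} ${autotp_size}' \
  < configs/ds_config_temp.json > configs/ds_config_rendered.json
```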