├── .dev_scripts ├── build_docs.sh ├── ci_container_test.sh └── dockerci.sh ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── SECURITY.md └── workflows │ ├── citest.yaml │ ├── lint.yaml │ └── publish.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .pre-commit-config_local.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTING_CN.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── README_CN.md ├── asset ├── banner.png ├── discord_qr.jpg └── wechat.png ├── docs ├── Makefile ├── README.md ├── make.bat ├── resources │ ├── dpo_data.png │ ├── grpo.png │ ├── grpo_clevr_count.png │ ├── grpo_code.png │ ├── grpo_countdown.png │ ├── grpo_countdown_1.png │ ├── grpo_geoqa.png │ ├── grpo_openr1_multimodal.png │ ├── kto_data.png │ ├── web-ui-en.jpg │ └── web-ui.jpg ├── source │ ├── .readthedocs.yaml │ ├── BestPractices │ │ ├── Embedding训练.md │ │ ├── GRPO代码训练.md │ │ ├── GRPO多模态训练.md │ │ ├── GRPO完整流程.md │ │ ├── NPU支持.md │ │ ├── 快速训练VL模型.md │ │ └── 更多最佳实践.md │ ├── Customization │ │ ├── 插件化.md │ │ ├── 自定义数据集.md │ │ └── 自定义模型.md │ ├── GetStarted │ │ ├── SWIFT安装.md │ │ ├── Web-UI.md │ │ └── 快速开始.md │ ├── Instruction │ │ ├── Agent支持.md │ │ ├── GRPO.md │ │ ├── Megatron-SWIFT训练.md │ │ ├── ReleaseNote3.0.md │ │ ├── 人类对齐.md │ │ ├── 使用tuners.md │ │ ├── 命令行参数.md │ │ ├── 导出与推送.md │ │ ├── 常见问题整理.md │ │ ├── 强化微调.md │ │ ├── 推理和部署.md │ │ ├── 支持的模型和数据集.md │ │ ├── 评测.md │ │ ├── 采样.md │ │ └── 预训练与微调.md │ ├── _templates │ │ ├── autosummary │ │ │ └── class.rst │ │ ├── classtemplate.rst │ │ └── sobolengine.rst │ ├── conf.py │ └── index.rst └── source_en │ ├── .readthedocs.yaml │ ├── BestPractices │ ├── Embedding.md │ ├── GRPO-Code-Training.md │ ├── GRPO-Multi-Modal-Training.md │ ├── GRPO.md │ ├── More-Best-Practices.md │ ├── NPU-support.md │ └── Rapidly-Training-VL-model.md │ ├── Customization │ ├── Custom-dataset.md │ ├── Custom-model.md │ └── Pluginization.md │ ├── GetStarted │ ├── Quick-start.md │ ├── 
SWIFT-installation.md │ └── Web-UI.md │ ├── Instruction │ ├── Agent-support.md │ ├── Command-line-parameters.md │ ├── Evaluation.md │ ├── Export-and-push.md │ ├── Frequently-asked-questions.md │ ├── GRPO.md │ ├── Inference-and-deployment.md │ ├── Megatron-SWIFT-Training.md │ ├── Pre-training-and-Fine-tuning.md │ ├── RLHF.md │ ├── Reinforced-Fine-tuning.md │ ├── ReleaseNote3.0.md │ ├── Sample.md │ ├── Supported-models-and-datasets.md │ └── Use-tuners.md │ ├── _templates │ ├── autosummary │ │ └── class.rst │ ├── classtemplate.rst │ └── sobolengine.rst │ ├── conf.py │ └── index.rst ├── examples ├── README.md ├── app │ ├── base_url │ │ ├── demo.py │ │ └── demo.sh │ ├── llm.sh │ └── mllm.sh ├── custom │ ├── dataset.py │ ├── infer.sh │ ├── model.py │ └── sft.sh ├── deploy │ ├── agent │ │ ├── client.py │ │ └── server.sh │ ├── bert │ │ ├── client.py │ │ └── server.sh │ ├── client │ │ ├── llm │ │ │ ├── base │ │ │ │ ├── openai_client.py │ │ │ │ └── swift_client.py │ │ │ └── chat │ │ │ │ ├── openai_client.py │ │ │ │ └── swift_client.py │ │ └── mllm │ │ │ ├── openai_client.py │ │ │ └── swift_client.py │ ├── lora │ │ ├── client.py │ │ └── server.sh │ ├── reward_model │ │ ├── client.py │ │ └── server.sh │ └── server │ │ ├── README.md │ │ └── demo.sh ├── eval │ ├── eval_url │ │ ├── demo.py │ │ └── eval.sh │ ├── llm │ │ └── eval.sh │ ├── train_eval │ │ └── train.sh │ └── vlm │ │ └── eval.sh ├── export │ ├── merge_lora.sh │ ├── ollama.sh │ ├── push_to_hub.sh │ └── quantize │ │ ├── awq.sh │ │ ├── bert │ │ ├── bnb.sh │ │ └── gptq.sh │ │ ├── bnb.sh │ │ ├── gptq.sh │ │ ├── mllm │ │ ├── awq.sh │ │ └── gptq.sh │ │ ├── moe │ │ ├── awq.sh │ │ └── gptq.sh │ │ ├── omni │ │ └── gptq.sh │ │ └── reward_model │ │ ├── bnb.sh │ │ └── gptq.sh ├── infer │ ├── cli_demo.sh │ ├── demo.py │ ├── demo_agent.py │ ├── demo_bert.py │ ├── demo_grounding.py │ ├── demo_hf.py │ ├── demo_lora.py │ ├── demo_mllm.py │ ├── demo_reward_model.py │ ├── lmdeploy │ │ └── mllm_tp.sh │ ├── pt │ │ ├── batch_ddp.sh │ │ ├── 
bert.sh │ │ ├── lora.sh │ │ ├── mllm_device_map.sh │ │ ├── prm.sh │ │ └── reward_model.sh │ └── vllm │ │ ├── dp_tp.sh │ │ ├── mllm_ddp.sh │ │ └── mllm_tp.sh ├── notebook │ ├── qwen2_5-self-cognition │ │ ├── infer.ipynb │ │ ├── infer.sh │ │ ├── self-cognition-sft.ipynb │ │ └── sft.sh │ ├── qwen2_5-vl-grounding │ │ └── zh.ipynb │ └── qwen2vl-ocr │ │ ├── infer.ipynb │ │ └── ocr-sft.ipynb ├── sampler │ ├── distill │ │ └── distill.sh │ └── mcts │ │ ├── mcts.py │ │ ├── mcts.sh │ │ └── system_prompt.txt └── train │ ├── agent │ ├── deepseek_r1.sh │ ├── glm4.sh │ ├── loss_scale │ │ ├── infer_lora.py │ │ └── train.sh │ └── qwen2_5.sh │ ├── all_to_all │ ├── infer.sh │ └── train.sh │ ├── base_to_chat │ ├── full.sh │ ├── lora.sh │ └── lora2.sh │ ├── embedding │ ├── train_gme.sh │ └── train_gte.sh │ ├── full │ ├── infer.sh │ ├── qwen2_5_32b.sh │ └── train.sh │ ├── grpo │ ├── external │ │ ├── README.md │ │ ├── agent.sh │ │ ├── grpo_32b_full.sh │ │ └── grpo_7b.sh │ ├── internal │ │ ├── README.md │ │ ├── full_lmdeploy.sh │ │ ├── pt.sh │ │ ├── vllm_72b_4gpu.sh │ │ ├── vllm_lora_qwenvl72b.sh │ │ ├── vllm_multi_round.sh │ │ └── vllm_vl7b.sh │ ├── multi_node │ │ ├── Qwen2_5_32B_full.sh │ │ ├── multi_node1.sh │ │ ├── multi_node2.sh │ │ └── train_dlc.sh │ ├── plugin │ │ ├── plugin.py │ │ ├── run_external_reward_func.sh │ │ └── run_external_reward_model.sh │ ├── prompt.txt │ └── qwen2_5_omni │ │ ├── grpo.sh │ │ └── infer.sh │ ├── infer.sh │ ├── lazy_tokenize │ └── train.sh │ ├── liger │ └── sft.sh │ ├── long_text │ ├── liger_kernel.sh │ ├── sequence_parallel.sh │ ├── sequence_parallel_512k.sh │ ├── sequence_parallel_dpo.sh │ └── sequence_parallel_grpo.sh │ ├── lora_sft.sh │ ├── megatron │ ├── base_to_chat.sh │ ├── benchmark │ │ └── deepspeed.sh │ ├── long_text.sh │ ├── moe.sh │ ├── multi-node │ │ ├── node1.sh │ │ └── node2.sh │ ├── pretrain.sh │ ├── qwen3_32b.sh │ ├── qwen3_moe.sh │ └── sft.sh │ ├── moe │ ├── llama4.sh │ └── qwen2_5_moe.sh │ ├── multi-gpu │ ├── ddp │ │ └── train.sh │ ├── 
ddp_device_map │ │ └── train.sh │ ├── deepspeed │ │ ├── train_zero2.sh │ │ └── train_zero3.sh │ ├── device_map │ │ └── train.sh │ └── fsdp_qlora │ │ ├── fsdp_offload.json │ │ └── train.sh │ ├── multi-node │ ├── accelerate │ │ ├── multi_node.yaml │ │ ├── train_node1.sh │ │ └── train_node2.sh │ ├── deepspeed │ │ ├── README.md │ │ ├── host.txt │ │ └── train.sh │ ├── dlc │ │ └── train.sh │ ├── swift │ │ ├── train_node1.sh │ │ └── train_node2.sh │ └── torchrun │ │ ├── train_node1.sh │ │ └── train_node2.sh │ ├── multimodal │ ├── audio.sh │ ├── caption.sh │ ├── grounding.sh │ ├── infer.sh │ ├── lora_llm_full_vit │ │ ├── custom_plugin.py │ │ ├── infer.sh │ │ ├── merge_lora.sh │ │ └── sft.sh │ ├── ocr.sh │ ├── omni │ │ ├── infer.sh │ │ └── sft.sh │ ├── rlhf │ │ ├── dpo │ │ │ ├── full.sh │ │ │ └── lora.sh │ │ └── kto.sh │ ├── video.sh │ └── vit_gradient_checkpointing.sh │ ├── optimizer │ └── muon.sh │ ├── packing │ ├── llm.sh │ ├── qwen2_5_omni.sh │ ├── qwen2_5_vl.sh │ └── streaming.sh │ ├── padding_free │ └── sft.sh │ ├── plugins │ ├── loss_scale.sh │ └── tuner_phi4_mm.sh │ ├── predict_with_generate │ └── train.sh │ ├── pretrain │ └── train.sh │ ├── qlora │ ├── awq.sh │ ├── bnb.sh │ ├── gptq.sh │ └── hqq.sh │ ├── rft │ ├── math.json │ └── rft.py │ ├── rlhf │ ├── README.md │ ├── cpo.sh │ ├── dpo │ │ ├── full.sh │ │ └── lora.sh │ ├── kto.sh │ ├── orpo.sh │ ├── ppo │ │ ├── full.sh │ │ └── lora.sh │ ├── rm.sh │ └── simpo.sh │ ├── seq_cls │ ├── bert │ │ ├── deploy.sh │ │ ├── infer.sh │ │ └── sft.sh │ ├── multi_label │ │ └── sft.sh │ ├── qwen2_5 │ │ ├── deploy.sh │ │ ├── infer.sh │ │ └── sft.sh │ ├── qwen2_vl │ │ ├── infer.sh │ │ └── sft.sh │ └── regression │ │ ├── deploy.sh │ │ ├── infer.sh │ │ └── sft.sh │ ├── streaming │ └── train.sh │ ├── think_model │ ├── deepseek_r1.sh │ ├── qwen3_demo1.sh │ └── qwen3_demo2.sh │ └── tuners │ ├── adalora │ └── train.sh │ ├── adapter │ └── train.sh │ ├── boft │ └── train.sh │ ├── bone │ └── train.sh │ ├── dora │ └── train.sh │ ├── galore │ 
├── train_galore.sh │ └── train_qgalore.sh │ ├── lisa │ └── train.sh │ ├── llamapro │ └── train.sh │ ├── longlora │ └── train.sh │ ├── lora-ga │ └── train.sh │ ├── lora │ └── train.sh │ ├── neftune │ └── train.sh │ ├── olora │ └── train.sh │ ├── pissa │ └── train.sh │ ├── qlora │ └── train.sh │ ├── reft │ └── train.sh │ └── unsloth │ └── train.sh ├── requirements.txt ├── requirements ├── docs.txt ├── eval.txt ├── framework.txt ├── install_all.sh ├── seq_parallel.txt ├── swanlab.txt └── tests.txt ├── scripts ├── benchmark │ ├── config │ │ └── tuner.json │ ├── exp.py │ ├── exp_utils.py │ └── generate_report.py └── utils │ ├── plot_loss.py │ ├── run_dataset_info.py │ ├── run_model_info.py │ ├── run_template.py │ └── test_link_valid.py ├── setup.cfg ├── setup.py ├── swift ├── __init__.py ├── cli │ ├── __init__.py │ ├── _megatron │ │ ├── __init__.py │ │ ├── main.py │ │ ├── pt.py │ │ └── sft.py │ ├── app.py │ ├── deploy.py │ ├── eval.py │ ├── export.py │ ├── infer.py │ ├── main.py │ ├── merge_lora.py │ ├── pt.py │ ├── rlhf.py │ ├── rollout.py │ ├── sample.py │ ├── sft.py │ └── web_ui.py ├── hub │ ├── __init__.py │ ├── constant.py │ └── hub.py ├── llm │ ├── __init__.py │ ├── app │ │ ├── __init__.py │ │ ├── app.py │ │ ├── build_ui.py │ │ └── locale.py │ ├── argument │ │ ├── __init__.py │ │ ├── app_args.py │ │ ├── base_args │ │ │ ├── __init__.py │ │ │ ├── base_args.py │ │ │ ├── data_args.py │ │ │ ├── generation_args.py │ │ │ ├── model_args.py │ │ │ ├── quant_args.py │ │ │ ├── template_args.py │ │ │ └── utils.py │ │ ├── deploy_args.py │ │ ├── eval_args.py │ │ ├── export_args.py │ │ ├── infer_args.py │ │ ├── merge_args.py │ │ ├── rlhf_args.py │ │ ├── sampling_args.py │ │ ├── train_args.py │ │ ├── tuner_args.py │ │ └── webui_args.py │ ├── base.py │ ├── data_loader.py │ ├── dataset │ │ ├── __init__.py │ │ ├── data │ │ │ └── dataset_info.json │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ ├── llm.py │ │ │ └── mllm.py │ │ ├── loader.py │ │ ├── media.py │ │ ├── preprocessor │ │ │ ├── 
__init__.py │ │ │ ├── core.py │ │ │ └── extra.py │ │ ├── register.py │ │ └── utils.py │ ├── ds_config │ │ ├── zero0.json │ │ ├── zero1.json │ │ ├── zero2.json │ │ ├── zero2_offload.json │ │ ├── zero3.json │ │ └── zero3_offload.json │ ├── eval │ │ ├── __init__.py │ │ ├── eval.py │ │ └── utils.py │ ├── export │ │ ├── __init__.py │ │ ├── export.py │ │ ├── merge_lora.py │ │ ├── ollama.py │ │ └── quant.py │ ├── infer │ │ ├── __init__.py │ │ ├── deploy.py │ │ ├── infer.py │ │ ├── infer_engine │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── grpo_vllm_engine.py │ │ │ ├── infer_client.py │ │ │ ├── infer_engine.py │ │ │ ├── lmdeploy_engine.py │ │ │ ├── patch.py │ │ │ ├── pt_engine.py │ │ │ ├── utils.py │ │ │ └── vllm_engine.py │ │ ├── protocol.py │ │ ├── rollout.py │ │ └── utils.py │ ├── model │ │ ├── __init__.py │ │ ├── constant.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── baai.py │ │ │ ├── baichuan.py │ │ │ ├── bert.py │ │ │ ├── codefuse.py │ │ │ ├── deepseek.py │ │ │ ├── gemma.py │ │ │ ├── glm.py │ │ │ ├── internlm.py │ │ │ ├── llama.py │ │ │ ├── llava.py │ │ │ ├── llm.py │ │ │ ├── mamba.py │ │ │ ├── microsoft.py │ │ │ ├── minicpm.py │ │ │ ├── minimax.py │ │ │ ├── mistral.py │ │ │ ├── mllm.py │ │ │ ├── moonshot.py │ │ │ ├── mplug.py │ │ │ ├── openbuddy.py │ │ │ ├── qwen.py │ │ │ ├── skywork.py │ │ │ ├── stepfun.py │ │ │ ├── telechat.py │ │ │ ├── valley.py │ │ │ └── yi.py │ │ ├── model_arch.py │ │ ├── patcher.py │ │ ├── register.py │ │ └── utils.py │ ├── sampling │ │ ├── __init__.py │ │ ├── base.py │ │ ├── distill_sampler.py │ │ ├── mcts.py │ │ ├── sampling.py │ │ ├── utils.py │ │ └── vanilla_sampler.py │ ├── template │ │ ├── __init__.py │ │ ├── base.py │ │ ├── constant.py │ │ ├── grounding.py │ │ ├── register.py │ │ ├── template │ │ │ ├── __init__.py │ │ │ ├── deepseek.py │ │ │ ├── emu3.py │ │ │ ├── gemma.py │ │ │ ├── glm.py │ │ │ ├── idefics3.py │ │ │ ├── internlm.py │ │ │ ├── internvl.py │ │ │ ├── llama.py │ │ │ ├── llava.py │ │ │ ├── llm.py │ │ │ ├── megrez.py │ 
│ │ ├── microsoft.py │ │ │ ├── minicpm.py │ │ │ ├── minimax.py │ │ │ ├── mistral.py │ │ │ ├── molmo.py │ │ │ ├── moonshot.py │ │ │ ├── mplug.py │ │ │ ├── openbuddy.py │ │ │ ├── pixtral.py │ │ │ ├── qwen.py │ │ │ ├── stepfun.py │ │ │ ├── utils.py │ │ │ ├── valley.py │ │ │ └── yi.py │ │ ├── template_inputs.py │ │ ├── template_meta.py │ │ ├── utils.py │ │ └── vision_utils.py │ ├── train │ │ ├── __init__.py │ │ ├── callback.py │ │ ├── kto.py │ │ ├── pt.py │ │ ├── rlhf.py │ │ ├── sft.py │ │ └── tuner.py │ └── utils.py ├── megatron │ ├── __init__.py │ ├── argument │ │ ├── __init__.py │ │ ├── megatron_args.py │ │ └── train_args.py │ ├── init.py │ ├── model │ │ ├── __init__.py │ │ ├── config.py │ │ ├── constant.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── hf2mcore.py │ │ │ ├── mcore2hf.py │ │ │ └── model.py │ │ ├── register.py │ │ └── rope.py │ ├── train │ │ ├── __init__.py │ │ ├── patcher.py │ │ ├── pt.py │ │ ├── sft.py │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ ├── convert.py │ │ └── patcher.py ├── plugin │ ├── __init__.py │ ├── agent_template │ │ ├── __init__.py │ │ ├── base.py │ │ ├── extra.py │ │ ├── glm4.py │ │ ├── hermes.py │ │ ├── llama.py │ │ ├── qwen.py │ │ ├── react.py │ │ └── toolbench.py │ ├── callback.py │ ├── loss.py │ ├── loss_scale │ │ ├── __init__.py │ │ ├── config │ │ │ ├── agentflan.json │ │ │ ├── alpha_umi.json │ │ │ ├── hermes.json │ │ │ ├── ignore_empty_think.json │ │ │ ├── qwen.json │ │ │ └── react.json │ │ ├── loss_scale.py │ │ └── utils.py │ ├── metric.py │ ├── multi_turn.py │ ├── optimizer.py │ ├── orm.py │ ├── prm.py │ ├── rm_plugin.py │ └── tuner.py ├── trainers │ ├── __init__.py │ ├── arguments.py │ ├── callback.py │ ├── mixin.py │ ├── optimizers │ │ ├── __init__.py │ │ └── galore │ │ │ ├── __init__.py │ │ │ ├── adafactor.py │ │ │ ├── adamw.py │ │ │ ├── adamw8bit.py │ │ │ ├── galore_projector.py │ │ │ └── utils.py │ ├── rlhf_arguments.py │ ├── rlhf_trainer │ │ ├── __init__.py │ │ ├── cpo_trainer.py │ │ ├── 
dpo_trainer.py │ │ ├── grpo_trainer.py │ │ ├── kto_trainer.py │ │ ├── orpo_trainer.py │ │ ├── ppo_trainer.py │ │ ├── reward_trainer.py │ │ ├── rlhf_mixin.py │ │ ├── utils.py │ │ └── vllm_client.py │ ├── sequence_parallel │ │ ├── __init__.py │ │ ├── base.py │ │ ├── ulysses.py │ │ └── xtuner.py │ ├── torchacc_mixin.py │ ├── trainer_factory.py │ ├── trainers.py │ └── utils.py ├── tuners │ ├── __init__.py │ ├── adapter.py │ ├── base.py │ ├── llamapro.py │ ├── longlora │ │ ├── __init__.py │ │ ├── llama.py │ │ └── longlora.py │ ├── lora.py │ ├── lora_layers.py │ ├── mapping.py │ ├── neftune.py │ ├── part.py │ ├── peft.py │ ├── prompt.py │ ├── reft.py │ ├── restuning.py │ ├── restuning_components.py │ ├── scetuning │ │ ├── __init__.py │ │ ├── scetuning.py │ │ └── scetuning_components.py │ ├── side.py │ └── utils.py ├── ui │ ├── __init__.py │ ├── app.py │ ├── base.py │ ├── llm_eval │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── llm_eval.py │ │ ├── model.py │ │ └── runtime.py │ ├── llm_export │ │ ├── __init__.py │ │ ├── export.py │ │ ├── llm_export.py │ │ ├── model.py │ │ └── runtime.py │ ├── llm_grpo │ │ ├── __init__.py │ │ ├── grpo_advanced.py │ │ ├── llm_grpo.py │ │ ├── model.py │ │ ├── ref_model.py │ │ ├── reward.py │ │ └── rollout.py │ ├── llm_infer │ │ ├── __init__.py │ │ ├── generate.py │ │ ├── llm_infer.py │ │ ├── model.py │ │ └── runtime.py │ └── llm_train │ │ ├── __init__.py │ │ ├── advanced.py │ │ ├── dataset.py │ │ ├── galore.py │ │ ├── hyper.py │ │ ├── lisa.py │ │ ├── llamapro.py │ │ ├── llm_train.py │ │ ├── lora.py │ │ ├── model.py │ │ ├── quantization.py │ │ ├── report_to.py │ │ ├── rlhf.py │ │ ├── runtime.py │ │ ├── save.py │ │ ├── self_cog.py │ │ └── utils.py ├── utils │ ├── __init__.py │ ├── constants.py │ ├── env.py │ ├── import_utils.py │ ├── io_utils.py │ ├── logger.py │ ├── np_utils.py │ ├── tb_utils.py │ ├── torch_utils.py │ ├── torchacc_utils.py │ └── utils.py └── version.py └── tests ├── __init__.py ├── app └── test_app.py ├── deploy ├── 
test_dataset.py └── test_logprobs.py ├── eval └── test_eval.py ├── export └── test_quant.py ├── general ├── test_arch.py ├── test_dataset.py ├── test_model.py ├── test_stream.py └── test_template.py ├── hub ├── __init__.py └── test_check_model.py ├── infer ├── test_agent.py ├── test_infer.py ├── test_logprobs.py ├── test_main.py ├── test_max_memory.py └── test_mllm.py ├── llm ├── __init__.py ├── config │ ├── infer.json │ └── sft.json ├── data │ ├── alpaca.csv │ ├── alpaca.jsonl │ ├── alpaca2.csv │ ├── chatml.jsonl │ ├── conversations.jsonl │ ├── multi_modal_1.jsonl │ ├── multi_modal_2.jsonl │ ├── multi_modal_3.jsonl │ ├── sharegpt.jsonl │ ├── swift_multi.json │ ├── swift_multi.jsonl │ ├── swift_pre.csv │ ├── swift_pre.jsonl │ ├── swift_single.csv │ └── swift_single.jsonl ├── load_model.py ├── load_template.py ├── test_custom.py ├── test_dataset.py ├── test_ollama_export.py ├── test_run.py ├── test_run3.py ├── test_template.py └── test_utils.py ├── megatron ├── test_align │ └── test_llm.py ├── test_export.py ├── test_model.py ├── test_save.py └── test_train.py ├── model_tag.py ├── models ├── test_flash_attn.py ├── test_llm.py └── test_mllm.py ├── run.py ├── run_config.yaml ├── sample └── test_client.py ├── test_align ├── test_cls.py ├── test_lmdeploy_vlm.py ├── test_padding_side.py ├── test_rlhf_loss.py ├── test_template │ ├── test_agent.py │ ├── test_audio.py │ ├── test_gene.py │ ├── test_llm.py │ ├── test_template.py │ ├── test_tool.py │ ├── test_video.py │ └── test_vision.py └── test_vllm_vlm.py ├── test_utils.py ├── train ├── test_cls.py ├── test_freeze.py ├── test_grounding.py ├── test_grpo.py ├── test_kto.py ├── test_liger.py ├── test_multilabel.py ├── test_packing.py ├── test_ppo.py ├── test_pt.py ├── test_rlhf.py ├── test_sample.py ├── test_sft.py ├── test_train_eval.py └── test_vit_lr.py ├── tuners ├── __init__.py ├── test_extra_state_dict.py ├── test_merged_linear.py ├── test_neft.py ├── test_peft.py ├── test_scetuning.py ├── test_swift_base.py ├── 
test_swift_device_map.py └── test_swift_restuning.py └── utils ├── __init__.py ├── test_file_utils.py ├── test_io_utils.py ├── test_split_str_parts_by.py └── test_torch_utils.py /.dev_scripts/build_docs.sh: -------------------------------------------------------------------------------- 1 | pip install -r requirements/docs.txt 2 | cd docs 3 | rm -rf build 4 | 5 | # update api rst 6 | #rm -rf source/api/ 7 | #sphinx-apidoc --module-first -o source/api/ ../modelscope/ 8 | make html 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Describe the bug and how to reproduce it, preferably with screenshots (描述bug以及复现过程,最好有截图) 12 | 13 | 14 | **Your hardware and system info** 15 | Provide your system info here, such as CUDA version, OS, GPU model, and torch version (在这里给出硬件信息和系统信息,如CUDA版本,系统,GPU型号和torch版本等) 16 | 17 | 18 | **Additional context** 19 | Add any other context about the problem here (在这里补充其他信息) 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the feature** 11 | Please describe the feature requested here(请在这里描述需求) 12 | 13 | **Paste any useful information** 14 | Paste any useful information, including papers, github links, etc.(请在这里描述其他有用的信息,比如相关的论文地址,github链接等) 15 | 16 | **Additional context** 17 | Add any other context or information here(其他信息可以写在这里) 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # PR type 2 | - [ ] Bug Fix 3 | - [ ] New Feature 4 | - [ ] Document Updates 5 | - [ ] More Models or Datasets Support 6 | 7 | # PR information 8 | 9 | Write the detailed information belonging to this PR. 10 | 11 | ## Experiment results 12 | 13 | Paste your experiment results here (if needed). 14 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | Security issues in a deep learning project usually come from non-standard third-party packages or continuously running services. If you encounter a security issue in our project, please consider reporting it to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. 
4 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint test 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.10 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.10' 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | - name: Linting 22 | run: pre-commit run --all-files 23 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v**' 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }}-publish 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | build-n-publish: 14 | runs-on: ubuntu-22.04 15 | #if: startsWith(github.event.ref, 'refs/tags') 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 3.10 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.10' 22 | - name: Install wheel 23 | run: pip install wheel packaging setuptools==69.5.1 24 | - name: Build ModelScope Swift 25 | run: python setup.py sdist bdist_wheel 26 | - name: Publish package to PyPI 27 | run: | 28 | pip install twine 29 | twine upload dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include swift/utils *.py 2 | recursive-include swift/llm/dataset/data *.* 3 | recursive-include swift/llm/ds_config *.json 4 | recursive-include 
requirements *.txt 5 | recursive-include swift/plugin/loss_scale/config *.json 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | WHL_BUILD_DIR :=package 2 | DOC_BUILD_DIR :=docs/build/ 3 | 4 | # default rule 5 | default: whl docs 6 | 7 | .PHONY: docs 8 | docs: 9 | bash .dev_scripts/build_docs.sh 10 | 11 | .PHONY: linter 12 | linter: 13 | bash .dev_scripts/linter.sh 14 | 15 | .PHONY: test 16 | test: 17 | bash .dev_scripts/citest.sh 18 | 19 | .PHONY: whl 20 | whl: 21 | python setup.py sdist bdist_wheel 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf $(WHL_BUILD_DIR) $(DOC_BUILD_DIR) 26 | -------------------------------------------------------------------------------- /asset/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/banner.png -------------------------------------------------------------------------------- /asset/discord_qr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/discord_qr.jpg -------------------------------------------------------------------------------- /asset/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/wechat.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/resources/dpo_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/dpo_data.png -------------------------------------------------------------------------------- /docs/resources/grpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo.png -------------------------------------------------------------------------------- /docs/resources/grpo_clevr_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_clevr_count.png -------------------------------------------------------------------------------- /docs/resources/grpo_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_code.png -------------------------------------------------------------------------------- /docs/resources/grpo_countdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_countdown.png -------------------------------------------------------------------------------- 
/docs/resources/grpo_countdown_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_countdown_1.png -------------------------------------------------------------------------------- /docs/resources/grpo_geoqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_geoqa.png -------------------------------------------------------------------------------- /docs/resources/grpo_openr1_multimodal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_openr1_multimodal.png -------------------------------------------------------------------------------- /docs/resources/kto_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/kto_data.png -------------------------------------------------------------------------------- /docs/resources/web-ui-en.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/web-ui-en.jpg -------------------------------------------------------------------------------- /docs/resources/web-ui.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/web-ui.jpg -------------------------------------------------------------------------------- /docs/source/.readthedocs.yaml: 
-------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 | python: 27 | install: 28 | - requirements: requirements/docs.txt 29 | - requirements: requirements/framework.txt 30 | -------------------------------------------------------------------------------- /docs/source/BestPractices/更多最佳实践.md: -------------------------------------------------------------------------------- 1 | 2 | # 更多最佳实践 3 | 4 | - [Qwen2.5自我认知微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition) 5 | - [Qwen2-VL Latex-OCR微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr) 6 | - [Qwen2.5-VL Grounding任务微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding) 7 | - [Qwen3全流程最佳实践](https://github.com/modelscope/ms-swift/issues/4030) 8 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :inherited-members: 8 | :members: 9 | 10 | .. 
autogenerated from source/_templates/autosummary/class.rst 11 | -------------------------------------------------------------------------------- /docs/source/_templates/classtemplate.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :members: 8 | :special-members: __init__, __call__ 9 | 10 | .. 11 | autogenerated from source/_templates/classtemplate.rst 12 | note it does not have :inherited-members: 13 | -------------------------------------------------------------------------------- /docs/source/_templates/sobolengine.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :members: 8 | :exclude-members: MAXBIT, MAXDIM 9 | :undoc-members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/sobolengine.rst 14 | note it has specific options 15 | -------------------------------------------------------------------------------- /docs/source_en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.12" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/source_en/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 
| python: 27 | install: 28 | - requirements: requirements/docs.txt 29 | - requirements: requirements/framework.txt 30 | -------------------------------------------------------------------------------- /docs/source_en/BestPractices/More-Best-Practices.md: -------------------------------------------------------------------------------- 1 | 2 | # More Best Practices 3 | 4 | - [Qwen2.5 self-cognition SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition) 5 | - [Qwen2-VL Latex-OCR SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr) 6 | - [Qwen2.5-VL Grounding Task SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding) 7 | - [Qwen3 Full-Pipeline Best Practices](https://github.com/modelscope/ms-swift/issues/4030) 8 | -------------------------------------------------------------------------------- /docs/source_en/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :inherited-members: 8 | :members: 9 | 10 | .. autogenerated from source/_templates/autosummary/class.rst 11 | -------------------------------------------------------------------------------- /docs/source_en/_templates/classtemplate.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :members: 8 | :special-members: __init__, __call__ 9 | 10 | .. 11 | autogenerated from source/_templates/classtemplate.rst 12 | note it does not have :inherited-members: 13 | -------------------------------------------------------------------------------- /docs/source_en/_templates/sobolengine.rst: -------------------------------------------------------------------------------- 1 | ..
currentmodule:: {{ module }} 2 | 3 | 4 | {{ name | underline}} 5 | 6 | .. autoclass:: {{ name }} 7 | :members: 8 | :exclude-members: MAXBIT, MAXDIM 9 | :undoc-members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/sobolengine.rst 14 | note it has specific options 15 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | These examples show how to use SWIFT for training, inference, deployment, evaluation, and quantization. By default, models are downloaded from the ModelScope community. 4 | 5 | If you want to use the Hugging Face community instead, change the command line like this: 6 | 7 | ```shell 8 | ... 9 | swift sft \ 10 | --model <model_id_or_path> \ 11 | --use_hf 1 \ 12 | ... 13 | ``` 14 | -------------------------------------------------------------------------------- /examples/app/base_url/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | import os 3 | 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | 6 | if __name__ == '__main__': 7 | from swift.llm import AppArguments, app_main, DeployArguments, run_deploy 8 | # Here's a runnable demo provided. 9 | # In a real scenario, you can simply remove the deployed context.
10 | with run_deploy( 11 | DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'), 12 | return_url=True) as url: 13 | app_main(AppArguments(model='Qwen2.5-1.5B-Instruct', base_url=url, stream=True, max_new_tokens=2048)) 14 | -------------------------------------------------------------------------------- /examples/app/base_url/demo.sh: -------------------------------------------------------------------------------- 1 | # You need to have a deployed model or api service first 2 | CUDA_VISIBLE_DEVICES=0 swift app \ 3 | --model '' \ 4 | --base_url http://127.0.0.1:8000/v1 \ 5 | --stream true \ 6 | --max_new_tokens 2048 \ 7 | --lang zh 8 | -------------------------------------------------------------------------------- /examples/app/llm.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 swift app \ 2 | --model Qwen/Qwen2.5-7B-Instruct \ 3 | --stream true \ 4 | --infer_backend vllm \ 5 | --max_new_tokens 2048 \ 6 | --gpu_memory_utilization 0.9 \ 7 | --max_model_len 8192 \ 8 | --lang zh 9 | -------------------------------------------------------------------------------- /examples/app/mllm.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | MAX_PIXELS=1003520 \ 3 | VIDEO_MAX_PIXELS=50176 \ 4 | FPS_MAX_FRAMES=12 \ 5 | swift app \ 6 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 7 | --stream true \ 8 | --infer_backend vllm \ 9 | --gpu_memory_utilization 0.9 \ 10 | --max_model_len 8192 \ 11 | --max_new_tokens 2048 \ 12 | --limit_mm_per_prompt '{"image": 5, "video": 2}' \ 13 | --lang zh 14 | -------------------------------------------------------------------------------- /examples/custom/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
2 | from typing import Any, Dict, Optional 3 | 4 | from swift.llm import DatasetMeta, ResponsePreprocessor, load_dataset, register_dataset 5 | 6 | 7 | class CustomPreprocessor(ResponsePreprocessor): 8 | prompt = """Task: Based on the given two sentences, provide a similarity score between 0.0 and 5.0. 9 | Sentence 1: {text1} 10 | Sentence 2: {text2} 11 | Similarity score: """ 12 | 13 | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: 14 | return super().preprocess({ 15 | 'query': self.prompt.format(text1=row['text1'], text2=row['text2']), 16 | 'response': f"{row['label']:.1f}" 17 | }) 18 | 19 | 20 | register_dataset( 21 | DatasetMeta( 22 | ms_dataset_id='swift/stsb', 23 | hf_dataset_id='SetFit/stsb', 24 | preprocess_func=CustomPreprocessor(), 25 | )) 26 | 27 | if __name__ == '__main__': 28 | dataset = load_dataset(['swift/stsb'])[0] 29 | print(f'dataset: {dataset}') 30 | print(f'dataset[0]: {dataset[0]}') 31 | -------------------------------------------------------------------------------- /examples/custom/infer.sh: -------------------------------------------------------------------------------- 1 | # sh examples/custom/infer.sh 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --adapters output/vx-xxx/checkpoint-xxx \ 5 | --load_data_args true \ 6 | --infer_backend pt \ 7 | --max_batch_size 16 \ 8 | --max_new_tokens 256 \ 9 | --temperature 0 10 | -------------------------------------------------------------------------------- /examples/custom/sft.sh: -------------------------------------------------------------------------------- 1 | # sh examples/custom/sft.sh 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --custom_register_path examples/custom/dataset.py \ 5 | examples/custom/model.py \ 6 | --model AI-ModelScope/Nemotron-Mini-4B-Instruct \ 7 | --train_type lora \ 8 | --dataset swift/stsb \ 9 | --num_train_epochs 3 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 
\ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --gradient_accumulation_steps 16 \ 17 | --eval_steps 100 \ 18 | --save_steps 100 \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --warmup_ratio 0.05 \ 22 | --dataloader_num_workers 4 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --dataset_num_proc 4 26 | -------------------------------------------------------------------------------- /examples/deploy/agent/server.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 swift deploy \ 2 | --model Qwen/Qwen2.5-7B-Instruct \ 3 | --infer_backend vllm \ 4 | --gpu_memory_utilization 0.9 \ 5 | --max_model_len 8192 \ 6 | --max_new_tokens 2048 \ 7 | --agent_template hermes \ 8 | --served_model_name Qwen2.5-7B-Instruct 9 | -------------------------------------------------------------------------------- /examples/deploy/bert/client.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from swift.llm import InferClient, InferRequest 4 | 5 | 6 | def infer_batch(engine: InferClient, infer_requests: List[InferRequest]): 7 | resp_list = engine.infer(infer_requests) 8 | query0 = infer_requests[0].messages[0]['content'] 9 | query1 = infer_requests[1].messages[0]['content'] 10 | print(f'query0: {query0}') 11 | print(f'response0: {resp_list[0].choices[0].message.content}') 12 | print(f'query1: {query1}') 13 | print(f'response1: {resp_list[1].choices[0].message.content}') 14 | 15 | 16 | if __name__ == '__main__': 17 | engine = InferClient(host='127.0.0.1', port=8000) 18 | models = engine.models 19 | print(f'models: {models}') 20 | infer_batch(engine, [ 21 | InferRequest(messages=[{ 22 | 'role': 'user', 23 | 'content': '今天天气真好呀' 24 | }]), 25 | InferRequest(messages=[{ 26 | 'role': 'user', 27 | 'content': '真倒霉' 28 | }]) 29 | ]) 30 | -------------------------------------------------------------------------------- 
/examples/deploy/bert/server.sh: -------------------------------------------------------------------------------- 1 | # Since `swift/test_bert` is trained by swift and contains an `args.json` file, 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. 3 | CUDA_VISIBLE_DEVICES=0 swift deploy \ 4 | --host 0.0.0.0 \ 5 | --port 8000 \ 6 | --adapters swift/test_bert \ 7 | --served_model_name bert-base-chinese \ 8 | --infer_backend pt \ 9 | --truncation_strategy right \ 10 | --max_length 512 11 | -------------------------------------------------------------------------------- /examples/deploy/lora/client.py: -------------------------------------------------------------------------------- 1 | from swift.llm import InferClient, InferRequest, RequestConfig 2 | 3 | 4 | def infer_multilora(engine: InferClient, infer_request: InferRequest): 5 | # Dynamic LoRA 6 | models = engine.models 7 | print(f'models: {models}') 8 | request_config = RequestConfig(max_tokens=512, temperature=0) 9 | 10 | # use lora1 11 | resp_list = engine.infer([infer_request], request_config, model=models[1]) 12 | response = resp_list[0].choices[0].message.content 13 | print(f'lora1-response: {response}') 14 | # origin model 15 | resp_list = engine.infer([infer_request], request_config, model=models[0]) 16 | response = resp_list[0].choices[0].message.content 17 | print(f'response: {response}') 18 | # use lora2 19 | resp_list = engine.infer([infer_request], request_config, model=models[2]) 20 | response = resp_list[0].choices[0].message.content 21 | print(f'lora2-response: {response}') 22 | 23 | 24 | if __name__ == '__main__': 25 | engine = InferClient(host='127.0.0.1', port=8000) 26 | infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]) 27 | infer_multilora(engine, infer_request) 28 | -------------------------------------------------------------------------------- /examples/deploy/lora/server.sh:
-------------------------------------------------------------------------------- 1 | # Since `swift/test_lora` is trained by swift and contains an `args.json` file, 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. 3 | CUDA_VISIBLE_DEVICES=0 swift deploy \ 4 | --host 0.0.0.0 \ 5 | --port 8000 \ 6 | --adapters lora1=swift/test_lora lora2=swift/test_lora2 \ 7 | --infer_backend vllm 8 | -------------------------------------------------------------------------------- /examples/deploy/reward_model/client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import InferClient, InferRequest 3 | 4 | if __name__ == '__main__': 5 | engine = InferClient(host='127.0.0.1', port=8000) 6 | models = engine.models 7 | print(f'models: {models}') 8 | messages = [{ 9 | 'role': 'user', 10 | 'content': "Hello! What's your name?" 11 | }, { 12 | 'role': 'assistant', 13 | 'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?' 14 | }] 15 | resp_list = engine.infer([InferRequest(messages=messages)]) 16 | print(f'messages: {messages}') 17 | print(f'response: {resp_list[0].choices[0].message.content}') 18 | -------------------------------------------------------------------------------- /examples/deploy/reward_model/server.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 swift deploy \ 2 | --host 0.0.0.0 \ 3 | --port 8000 \ 4 | --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ 5 | --infer_backend pt 6 | -------------------------------------------------------------------------------- /examples/deploy/server/README.md: -------------------------------------------------------------------------------- 1 | Please refer to the examples in [examples/infer](../../infer/) and change `swift infer` to `swift deploy` to start the service. 
(You need to additionally remove `--val_dataset`) 2 | 3 | e.g. 4 | ```shell 5 | CUDA_VISIBLE_DEVICES=0 \ 6 | swift deploy \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --infer_backend vllm 9 | ``` 10 | -------------------------------------------------------------------------------- /examples/deploy/server/demo.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 swift deploy \ 2 | --model Qwen/Qwen2.5-7B-Instruct \ 3 | --infer_backend vllm \ 4 | --served_model_name Qwen2.5-7B-Instruct 5 | 6 | # After the server-side deployment above is successful, use the command below to perform a client call test. 7 | 8 | # curl http://localhost:8000/v1/chat/completions \ 9 | # -H "Content-Type: application/json" \ 10 | # -d '{ 11 | # "model": "Qwen2.5-7B-Instruct", 12 | # "messages": [{"role": "user", "content": "What is your name?"}], 13 | # "temperature": 0 14 | # }' 15 | -------------------------------------------------------------------------------- /examples/eval/eval_url/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | import os 3 | 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | 6 | if __name__ == '__main__': 7 | from swift.llm import EvalArguments, eval_main, run_deploy, DeployArguments 8 | # Here's a runnable demo provided. Use the eval_url method for evaluation. 9 | # In a real scenario, you can simply remove the deployed context. 
10 | print(EvalArguments.list_eval_dataset()) 11 | with run_deploy( 12 | DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'), 13 | return_url=True) as url: 14 | eval_main(EvalArguments(model='Qwen2.5-1.5B-Instruct', eval_url=url, eval_dataset=['arc'])) 15 | -------------------------------------------------------------------------------- /examples/eval/eval_url/eval.sh: -------------------------------------------------------------------------------- 1 | # You need to have a deployed model or api service first 2 | swift eval \ 3 | --model '' \ 4 | --eval_backend OpenCompass \ 5 | --eval_url http://127.0.0.1:8000/v1 \ 6 | --eval_limit 100 \ 7 | --eval_dataset gsm8k 8 | -------------------------------------------------------------------------------- /examples/eval/llm/eval.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift eval \ 3 | --model Qwen/Qwen2.5-1.5B-Instruct \ 4 | --eval_backend OpenCompass \ 5 | --infer_backend vllm \ 6 | --eval_limit 100 \ 7 | --eval_dataset gsm8k 8 | -------------------------------------------------------------------------------- /examples/eval/train_eval/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model "Qwen/Qwen2.5-0.5B-Instruct" \ 4 | --train_type "lora" \ 5 | --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \ 6 | --torch_dtype "bfloat16" \ 7 | --num_train_epochs "1" \ 8 | --per_device_train_batch_size "1" \ 9 | --learning_rate "1e-4" \ 10 | --lora_rank "8" \ 11 | --lora_alpha "32" \ 12 | --target_modules "all-linear" \ 13 | --gradient_accumulation_steps "16" \ 14 | --save_steps "50" \ 15 | --save_total_limit "5" \ 16 | --logging_steps "5" \ 17 | --max_length "2048" \ 18 | --eval_strategy "steps" \ 19 | --eval_steps "5" \ 20 | --per_device_eval_batch_size "5" \ 21 | --eval_use_evalscope \ 22 | --eval_datasets 
"gsm8k" \ 23 | --eval_datasets_args '{"gsm8k": {"few_shot_num": 0}}' \ 24 | --eval_limit "10" 25 | -------------------------------------------------------------------------------- /examples/eval/vlm/eval.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | MAX_PIXELS=1003520 \ 3 | swift eval \ 4 | --model Qwen/Qwen2-VL-2B-Instruct \ 5 | --infer_backend vllm \ 6 | --eval_limit 100 \ 7 | --eval_dataset realWorldQA \ 8 | --eval_backend VLMEvalKit 9 | -------------------------------------------------------------------------------- /examples/export/merge_lora.sh: -------------------------------------------------------------------------------- 1 | # Since `output/vx-xxx/checkpoint-xxx` is trained by swift and contains an `args.json` file, 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. 3 | swift export \ 4 | --adapters output/vx-xxx/checkpoint-xxx \ 5 | --merge_lora true 6 | -------------------------------------------------------------------------------- /examples/export/ollama.sh: -------------------------------------------------------------------------------- 1 | swift export \ 2 | --model Qwen/Qwen2.5-1.5B-Instruct \ 3 | --to_ollama true \ 4 | --output_dir Qwen2.5-1.5B-Instruct-ollama 5 | -------------------------------------------------------------------------------- /examples/export/push_to_hub.sh: -------------------------------------------------------------------------------- 1 | swift export \ 2 | --adapters output/vx-xxx/checkpoint-xxx \ 3 | --push_to_hub true \ 4 | --hub_model_id '' \ 5 | --hub_token '' \ 6 | --use_hf false 7 | -------------------------------------------------------------------------------- /examples/export/quantize/awq.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift export \ 3 | --model Qwen/Qwen2.5-72B-Instruct \ 4 | --dataset 
'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 5 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 6 | --device_map cpu \ 7 | --quant_n_samples 256 \ 8 | --quant_batch_size 1 \ 9 | --max_length 2048 \ 10 | --quant_method awq \ 11 | --quant_bits 4 \ 12 | --output_dir Qwen2.5-72B-Instruct-AWQ 13 | -------------------------------------------------------------------------------- /examples/export/quantize/bert/bnb.sh: -------------------------------------------------------------------------------- 1 | # merge-lora 2 | CUDA_VISIBLE_DEVICES=0 swift export \ 3 | --adapters swift/test_bert \ 4 | --output_dir output/swift_test_bert_merged \ 5 | --merge_lora true 6 | 7 | # bnb quantize 8 | CUDA_VISIBLE_DEVICES=0 swift export \ 9 | --model output/swift_test_bert_merged \ 10 | --output_dir output/swift_test_bert_bnb_int4 \ 11 | --quant_bits 4 \ 12 | --quant_method bnb 13 | 14 | # infer 15 | CUDA_VISIBLE_DEVICES=0 swift infer \ 16 | --model output/swift_test_bert_bnb_int4 17 | -------------------------------------------------------------------------------- /examples/export/quantize/bert/gptq.sh: -------------------------------------------------------------------------------- 1 | # merge-lora 2 | CUDA_VISIBLE_DEVICES=0 swift export \ 3 | --adapters swift/test_bert \ 4 | --output_dir output/swift_test_bert_merged \ 5 | --merge_lora true 6 | 7 | # gptq quantize 8 | CUDA_VISIBLE_DEVICES=0 swift export \ 9 | --model output/swift_test_bert_merged \ 10 | --load_data_args true \ 11 | --output_dir output/swift_test_bert_gptq_int4 \ 12 | --quant_bits 4 \ 13 | --quant_method gptq \ 14 | --max_length 512 15 | 16 | # infer 17 | CUDA_VISIBLE_DEVICES=0 swift infer \ 18 | --model output/swift_test_bert_gptq_int4 19 | -------------------------------------------------------------------------------- /examples/export/quantize/bnb.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift export \ 3 | --model Qwen/Qwen2.5-1.5B-Instruct \ 4 | 
--quant_method bnb \ 5 | --quant_bits 4 \ 6 | --bnb_4bit_quant_type nf4 \ 7 | --bnb_4bit_use_double_quant true \ 8 | --output_dir Qwen2.5-1.5B-Instruct-BNB-NF4 9 | -------------------------------------------------------------------------------- /examples/export/quantize/gptq.sh: -------------------------------------------------------------------------------- 1 | # OMP_NUM_THREADS=14 please Check issue: https://github.com/AutoGPTQ/AutoGPTQ/issues/439 2 | OMP_NUM_THREADS=14 \ 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift export \ 5 | --model Qwen/Qwen2.5-1.5B-Instruct \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 8 | --quant_n_samples 256 \ 9 | --quant_batch_size 1 \ 10 | --max_length 2048 \ 11 | --quant_method gptq \ 12 | --quant_bits 4 \ 13 | --output_dir Qwen2.5-1.5B-Instruct-GPTQ-Int4 14 | -------------------------------------------------------------------------------- /examples/export/quantize/mllm/awq.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | MAX_PIXELS=1003520 \ 3 | VIDEO_MAX_PIXELS=50176 \ 4 | FPS_MAX_FRAMES=12 \ 5 | swift export \ 6 | --model Qwen/Qwen2.5-VL-3B-Instruct \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'modelscope/coco_2014_caption:validation#500' \ 10 | 'swift/VideoChatGPT:Generic#500' \ 11 | --quant_n_samples 256 \ 12 | --quant_batch_size -1 \ 13 | --max_length 2048 \ 14 | --quant_method awq \ 15 | --quant_bits 4 \ 16 | --output_dir Qwen2.5-VL-3B-Instruct-AWQ 17 | -------------------------------------------------------------------------------- /examples/export/quantize/mllm/gptq.sh: -------------------------------------------------------------------------------- 1 | # OMP_NUM_THREADS=14 please Check issue: https://github.com/AutoGPTQ/AutoGPTQ/issues/439 2 | OMP_NUM_THREADS=14 \ 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | MAX_PIXELS=1003520 \ 5 | VIDEO_MAX_PIXELS=50176 
\ 6 | FPS_MAX_FRAMES=12 \ 7 | swift export \ 8 | --model Qwen/Qwen2.5-VL-3B-Instruct \ 9 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 10 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 11 | 'modelscope/coco_2014_caption:validation#500' \ 12 | 'swift/VideoChatGPT:Generic#500' \ 13 | --quant_n_samples 256 \ 14 | --quant_batch_size 1 \ 15 | --max_length 2048 \ 16 | --quant_method gptq \ 17 | --quant_bits 4 \ 18 | --output_dir Qwen2.5-VL-3B-Instruct-GPTQ-Int4 19 | -------------------------------------------------------------------------------- /examples/export/quantize/moe/awq.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 \ 2 | swift export \ 3 | --model Qwen/Qwen3-30B-A3B \ 4 | --dataset 'swift/Qwen3-SFT-Mixin' \ 5 | --device_map auto \ 6 | --quant_n_samples 64 \ 7 | --quant_batch_size -1 \ 8 | --max_length 8192 \ 9 | --quant_method awq \ 10 | --quant_bits 4 \ 11 | --output_dir Qwen3-30B-A3B-AWQ 12 | -------------------------------------------------------------------------------- /examples/export/quantize/moe/gptq.sh: -------------------------------------------------------------------------------- 1 | # 2 * 80GB 2 | OMP_NUM_THREADS=14 \ 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | swift export \ 5 | --model Qwen/Qwen2-57B-A14B-Instruct \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#1000' \ 8 | --quant_n_samples 512 \ 9 | --quant_batch_size 1 \ 10 | --max_length 4096 \ 11 | --quant_method gptq \ 12 | --quant_bits 4 \ 13 | --output_dir Qwen2-57B-A14B-Instruct-GPTQ-Int4 14 | -------------------------------------------------------------------------------- /examples/export/quantize/omni/gptq.sh: -------------------------------------------------------------------------------- 1 | # OMP_NUM_THREADS=14 please Check issue: https://github.com/AutoGPTQ/AutoGPTQ/issues/439 2 | OMP_NUM_THREADS=14 \ 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | MAX_PIXELS=1003520 \ 5 | 
VIDEO_MAX_PIXELS=50176 \ 6 | FPS_MAX_FRAMES=12 \ 7 | swift export \ 8 | --model Qwen/Qwen2.5-Omni-7B \ 9 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 10 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 11 | 'modelscope/coco_2014_caption:validation#500' \ 12 | 'swift/VideoChatGPT:Generic#500' \ 13 | --quant_n_samples 256 \ 14 | --quant_batch_size 1 \ 15 | --max_length 2048 \ 16 | --quant_method gptq \ 17 | --quant_bits 4 \ 18 | --output_dir Qwen2.5-Omni-7B-GPTQ-Int4 19 | -------------------------------------------------------------------------------- /examples/export/quantize/reward_model/bnb.sh: -------------------------------------------------------------------------------- 1 | # bnb quantize 2 | CUDA_VISIBLE_DEVICES=0 swift export \ 3 | --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ 4 | --output_dir output/internlm2-1_8b-reward-bnb-int4 \ 5 | --quant_bits 4 \ 6 | --quant_method bnb 7 | 8 | # infer 9 | CUDA_VISIBLE_DEVICES=0 swift infer \ 10 | --model output/internlm2-1_8b-reward-bnb-int4 \ 11 | --val_dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \ 12 | --max_batch_size 16 13 | -------------------------------------------------------------------------------- /examples/export/quantize/reward_model/gptq.sh: -------------------------------------------------------------------------------- 1 | # gptq quantize 2 | CUDA_VISIBLE_DEVICES=0 swift export \ 3 | --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ 4 | --output_dir output/internlm2-1_8b-reward-gptq-int4 \ 5 | --quant_bits 4 \ 6 | --quant_method gptq \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' 'AI-ModelScope/alpaca-gpt4-data-en#1000' 8 | 9 | # infer 10 | CUDA_VISIBLE_DEVICES=0 swift infer \ 11 | --model output/internlm2-1_8b-reward-gptq-int4 \ 12 | --val_dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \ 13 | --max_batch_size 16 14 | -------------------------------------------------------------------------------- /examples/infer/cli_demo.sh: 
-------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --model Qwen/Qwen2.5-1.5B-Instruct \ 4 | --infer_backend pt \ 5 | --stream true \ 6 | --max_new_tokens 2048 7 | -------------------------------------------------------------------------------- /examples/infer/lmdeploy/mllm_tp.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 \ 2 | swift infer \ 3 | --model OpenGVLab/InternVL2_5-1B \ 4 | --infer_backend lmdeploy \ 5 | --val_dataset AI-ModelScope/captcha-images#1000 \ 6 | --tp 2 \ 7 | --vision_batch_size 8 \ 8 | --max_new_tokens 2048 9 | -------------------------------------------------------------------------------- /examples/infer/pt/batch_ddp.sh: -------------------------------------------------------------------------------- 1 | # 18GB 2 | NPROC_PER_NODE=4 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 4 | swift infer \ 5 | --model Qwen/Qwen2.5-1.5B-Instruct \ 6 | --infer_backend pt \ 7 | --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \ 8 | --max_batch_size 16 \ 9 | --max_new_tokens 512 10 | -------------------------------------------------------------------------------- /examples/infer/pt/bert.sh: -------------------------------------------------------------------------------- 1 | # Since `swift/test_bert` is trained by swift and contains an `args.json` file, 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. 3 | # To disable this behavior, please set `--load_args false`.
4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift infer \ 6 | --adapters swift/test_bert \ 7 | --truncation_strategy right \ 8 | --max_length 512 9 | -------------------------------------------------------------------------------- /examples/infer/pt/lora.sh: -------------------------------------------------------------------------------- 1 | # Since `swift/test_lora` is trained by swift and contains an `args.json` file, 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. 3 | # To disable this behavior, please set `--load_args false`. 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift infer \ 6 | --adapters swift/test_lora \ 7 | --infer_backend pt \ 8 | --stream true \ 9 | --temperature 0 \ 10 | --max_new_tokens 2048 11 | -------------------------------------------------------------------------------- /examples/infer/pt/mllm_device_map.sh: -------------------------------------------------------------------------------- 1 | NPROC_PER_NODE=2 \ 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 3 | MAX_PIXELS=1003520 \ 4 | swift infer \ 5 | --model Qwen/Qwen2.5-VL-3B-Instruct \ 6 | --infer_backend pt \ 7 | --val_dataset AI-ModelScope/LaTeX_OCR#1000 \ 8 | --max_batch_size 16 \ 9 | --max_new_tokens 512 10 | -------------------------------------------------------------------------------- /examples/infer/pt/prm.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --model Qwen/Qwen2.5-Math-PRM-7B \ 4 | --infer_backend pt 5 | -------------------------------------------------------------------------------- /examples/infer/pt/reward_model.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ 4 | --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \ 5 | --max_batch_size 64 6 | 
-------------------------------------------------------------------------------- /examples/infer/vllm/dp_tp.sh: -------------------------------------------------------------------------------- 1 | NPROC_PER_NODE=4 \ 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 3 | swift infer \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --infer_backend vllm \ 6 | --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#2000 \ 7 | --gpu_memory_utilization 0.9 \ 8 | --max_model_len 8192 \ 9 | --tensor_parallel_size 2 \ 10 | --max_new_tokens 2048 \ 11 | --write_batch_size 1000 12 | -------------------------------------------------------------------------------- /examples/infer/vllm/mllm_ddp.sh: -------------------------------------------------------------------------------- 1 | # You need to use flash-attn (manual installation) instead of xformers. 2 | NPROC_PER_NODE=2 \ 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | swift infer \ 5 | --model Qwen/Qwen2-Audio-7B-Instruct \ 6 | --infer_backend vllm \ 7 | --val_dataset speech_asr/speech_asr_aishell1_trainsets:validation#1000 \ 8 | --gpu_memory_utilization 0.9 \ 9 | --max_model_len 8192 \ 10 | --max_new_tokens 2048 \ 11 | --limit_mm_per_prompt '{"audio": 5}' 12 | -------------------------------------------------------------------------------- /examples/infer/vllm/mllm_tp.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 \ 2 | MAX_PIXELS=1003520 \ 3 | swift infer \ 4 | --model Qwen/Qwen2.5-VL-3B-Instruct \ 5 | --infer_backend vllm \ 6 | --val_dataset AI-ModelScope/LaTeX_OCR#1000 \ 7 | --gpu_memory_utilization 0.9 \ 8 | --tensor_parallel_size 2 \ 9 | --max_model_len 32768 \ 10 | --max_new_tokens 2048 \ 11 | --limit_mm_per_prompt '{"image": 5, "video": 2}' 12 | -------------------------------------------------------------------------------- /examples/notebook/qwen2_5-self-cognition/infer.sh: -------------------------------------------------------------------------------- 1 | # Here is the command-line style 
inference code. 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --adapters output/vx-xxx/checkpoint-xxx \ 5 | --stream true \ 6 | --temperature 0 \ 7 | --max_new_tokens 2048 8 | -------------------------------------------------------------------------------- /examples/notebook/qwen2_5-self-cognition/sft.sh: -------------------------------------------------------------------------------- 1 | # Here is the command-line style training code. 2 | # 22GB 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-3B-Instruct \ 6 | --train_type lora \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'swift/self-cognition#500' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 50 \ 20 | --save_steps 50 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' 
\ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --dataset_num_proc 4 \ 29 | --model_name 小黄 'Xiao Huang' \ 30 | --model_author '魔搭' 'ModelScope' 31 | -------------------------------------------------------------------------------- /examples/sampler/distill/distill.sh: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="xxx" \ 2 | swift sample \ 3 | --sampler_type distill \ 4 | --sampler_engine client \ 5 | --model deepseek-r1 \ 6 | --stream true \ 7 | --dataset tastelikefeet/competition_math#5 \ 8 | --num_return_sequences 1 \ 9 | --temperature 0.6 \ 10 | --top_p 0.95 \ 11 | --engine_kwargs '{"base_url":"https://dashscope.aliyuncs.com/compatible-mode/v1"}' 12 | -------------------------------------------------------------------------------- /examples/sampler/mcts/system_prompt.txt: -------------------------------------------------------------------------------- 1 | You are a math model, you should **think step by step** carefully. Each step should **end with \"ки\”**. Final answer should be in a ‘\boxed()’. 2 | 3 | ## Example: 4 | Step1: XXX. ки\n 5 | Step2: XXX. ки\n 6 | Step3: XXX. ки\n 7 | Answer: \boxed(answer). 
ки\n 8 | -------------------------------------------------------------------------------- /examples/train/agent/deepseek_r1.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ 4 | --train_type full \ 5 | --dataset AI-ModelScope/function-calling-chatml \ 6 | --agent_template react_en \ 7 | --loss_scale react \ 8 | --response_prefix '' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 2 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 8 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --save_only_model true \ 21 | --packing true \ 22 | --use_liger_kernel true \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --attn_impl flash_attn \ 26 | --dataloader_num_workers 4 \ 27 | --dataset_num_proc 16 28 | -------------------------------------------------------------------------------- /examples/train/agent/glm4.sh: -------------------------------------------------------------------------------- 1 | # 4 * 80GiB 2 | NPROC_PER_NODE=4 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 4 | swift sft \ 5 | --model ZhipuAI/GLM-4-9B-0414 \ 6 | --train_type full \ 7 | --dataset AI-ModelScope/function-calling-chatml \ 8 | --agent_template hermes \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 2 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 2 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --save_only_model true \ 21 | --packing true \ 22 | --deepspeed zero3 \ 23 | --use_liger_kernel true \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --attn_impl flash_attn \ 27 | 
--dataloader_num_workers 4 \ 28 | --dataset_num_proc 16 29 | -------------------------------------------------------------------------------- /examples/train/agent/loss_scale/train.sh: -------------------------------------------------------------------------------- 1 | # 20GB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-3B \ 5 | --train_type lora \ 6 | --dataset AI-ModelScope/function-calling-chatml#10000 \ 7 | --loss_scale hermes \ 8 | --agent_template hermes \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 2 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --modules_to_save embed_tokens lm_head \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --use_liger_kernel true \ 25 | --output_dir output \ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --dataset_num_proc 16 29 | -------------------------------------------------------------------------------- /examples/train/agent/qwen2_5.sh: -------------------------------------------------------------------------------- 1 | # 35GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-3B \ 5 | --train_type full \ 6 | --dataset AI-ModelScope/function-calling-chatml \ 7 | --agent_template hermes \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 2 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-5 \ 13 | --gradient_accumulation_steps 8 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --max_length 8192 \ 19 | --save_only_model true \ 20 | --packing true \ 21 | --use_liger_kernel true \ 22 | --output_dir output \ 23 | --warmup_ratio 0.05 \ 24 | --attn_impl flash_attn \ 25 | 
--dataloader_num_workers 4 \ 26 | --dataset_num_proc 16 27 | -------------------------------------------------------------------------------- /examples/train/all_to_all/infer.sh: -------------------------------------------------------------------------------- 1 | # 53GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --model BAAI/Emu3-Gen \ 5 | --infer_backend pt \ 6 | --stream False \ 7 | --use_chat_template False \ 8 | --top_k 2048 \ 9 | --max_new_tokens 40960 10 | -------------------------------------------------------------------------------- /examples/train/all_to_all/train.sh: -------------------------------------------------------------------------------- 1 | # 70 GiB * 2 2 | nproc_per_node=2 3 | NPROC_PER_NODE=$nproc_per_node \ 4 | CUDA_VISIBLE_DEVICES=0,2 \ 5 | max_position_embeddings=10240 \ 6 | image_area=518400 \ 7 | swift sft \ 8 | --model BAAI/Emu3-Gen \ 9 | --train_type lora \ 10 | --dataset 'swift/TextCaps#40' \ 11 | --torch_dtype bfloat16 \ 12 | --num_train_epochs 10 \ 13 | --per_device_train_batch_size 1 \ 14 | --learning_rate 1e-5 \ 15 | --gradient_accumulation_steps 4 \ 16 | --warmup_ratio 0.03 \ 17 | --eval_steps 500 \ 18 | --save_steps 500 \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --max_length 1024 \ 22 | --weight_decay 0.1 \ 23 | --gradient_checkpointing_kwargs '{"use_reentrant": false}' 24 | -------------------------------------------------------------------------------- /examples/train/base_to_chat/full.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=2 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | NPROC_PER_NODE=$nproc_per_node \ 5 | swift sft \ 6 | --model Qwen/Qwen2.5-1.5B \ 7 | --train_type full \ 8 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 9 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 10 | 'swift/self-cognition' \ 11 | --torch_dtype bfloat16 \ 12 | --num_train_epochs 10 \ 13 | --per_device_train_batch_size 1 \ 14 | --per_device_eval_batch_size 1 \ 15 | 
--learning_rate 1e-5 \ 16 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 17 | --eval_steps 200 \ 18 | --save_steps 200 \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --max_length 2048 \ 22 | --output_dir output \ 23 | --system 'You are a helpful assistant.' \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 \ 26 | --model_author swift \ 27 | --model_name swift-robot \ 28 | --deepspeed zero2 29 | -------------------------------------------------------------------------------- /examples/train/base_to_chat/lora.sh: -------------------------------------------------------------------------------- 1 | # Use `--template default` 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 \ 5 | MASTER_PORT=29501 \ 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | swift sft \ 8 | --model Qwen/Qwen2.5-1.5B \ 9 | --train_type lora \ 10 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 11 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 12 | 'swift/self-cognition' \ 13 | --torch_dtype bfloat16 \ 14 | --template default \ 15 | --num_train_epochs 10 \ 16 | --per_device_train_batch_size 1 \ 17 | --per_device_eval_batch_size 1 \ 18 | --learning_rate 1e-4 \ 19 | --lora_rank 8 \ 20 | --lora_alpha 32 \ 21 | --target_modules all-linear \ 22 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 23 | --eval_steps 50 \ 24 | --save_steps 50 \ 25 | --save_total_limit 2 \ 26 | --logging_steps 5 \ 27 | --max_length 2048 \ 28 | --output_dir output \ 29 | --system 'You are a helpful assistant.' 
\ 30 | --warmup_ratio 0.05 \ 31 | --dataloader_num_workers 4 \ 32 | --model_author swift \ 33 | --model_name swift-robot \ 34 | --deepspeed zero2 35 | -------------------------------------------------------------------------------- /examples/train/embedding/train_gme.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=8 2 | 3 | # losses: plugin/loss.py 4 | # 8*40G 5 | MAX_PIXELS=1003520 \ 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | swift sft \ 8 | --model iic/gme-Qwen2-VL-2B-Instruct \ 9 | --train_type lora \ 10 | --dataset 'swift/TextCaps:emb' \ 11 | --torch_dtype bfloat16 \ 12 | --num_train_epochs 1 \ 13 | --per_device_train_batch_size 2 \ 14 | --per_device_eval_batch_size 2 \ 15 | --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \ 16 | --eval_steps 100 \ 17 | --save_steps 100 \ 18 | --eval_strategy steps \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --output_dir output \ 22 | --lazy_tokenize true \ 23 | --warmup_ratio 0.05 \ 24 | --learning_rate 5e-6 \ 25 | --deepspeed zero3 \ 26 | --dataloader_num_workers 4 \ 27 | --task_type embedding \ 28 | --loss_type infonce \ 29 | --dataloader_drop_last true 30 | -------------------------------------------------------------------------------- /examples/train/embedding/train_gte.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=8 2 | # 4*12G 3 | # losses: plugin/loss.py 4 | # data format: docs/source_en/Customization/Custom-dataset.md 5 | # --use_chat_template must be false to use generation template 6 | # --dataloader_drop_last must be true or eval gather will throw error 7 | # --model iic/gte-modernbert-base modernbert also supported 8 | NPROC_PER_NODE=$nproc_per_node \ 9 | swift sft \ 10 | --model iic/gte_Qwen2-7B-instruct \ 11 | --train_type lora \ 12 | --dataset 'sentence-transformers/stsb' \ 13 | --torch_dtype bfloat16 \ 14 | --num_train_epochs 1 \ 15 | --per_device_train_batch_size 2 \ 16 | 
--per_device_eval_batch_size 1 \ 17 | --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --eval_strategy steps \ 21 | --use_chat_template false \ 22 | --save_total_limit 2 \ 23 | --logging_steps 5 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --learning_rate 5e-6 \ 27 | --deepspeed zero3 \ 28 | --dataloader_num_workers 4 \ 29 | --task_type embedding \ 30 | --loss_type cosine_similarity \ 31 | --dataloader_drop_last true 32 | -------------------------------------------------------------------------------- /examples/train/full/infer.sh: -------------------------------------------------------------------------------- 1 | # If you are using the validation set for inference, add the parameter `--load_data_args true`. 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --model output/vx-xxx/checkpoint-xxx \ 5 | --stream true \ 6 | --temperature 0 \ 7 | --max_new_tokens 2048 8 | -------------------------------------------------------------------------------- /examples/train/full/qwen2_5_32b.sh: -------------------------------------------------------------------------------- 1 | # 8 * 80GiB 2 | NPROC_PER_NODE=8 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-32B \ 6 | --train_type full \ 7 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 8 | --torch_dtype bfloat16 \ 9 | --max_steps 2000 \ 10 | --streaming true \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 2 \ 15 | --packing true \ 16 | --eval_steps 200 \ 17 | --save_steps 200 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --warmup_ratio 0.05 \ 21 | --dataloader_num_workers 8 \ 22 | --dataset_num_proc 8 \ 23 | --save_total_limit 2 \ 24 | --save_only_model true \ 25 | --output_dir output/Qwen2.5-32B \ 26 | --deepspeed zero3 \ 27 | --use_liger_kernel true \ 28 | --attn_impl flash_attn 29 | 
-------------------------------------------------------------------------------- /examples/train/full/train.sh: -------------------------------------------------------------------------------- 1 | # 76GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type full \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 8 | 'swift/self-cognition#500' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 16 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --max_length 2048 \ 20 | --output_dir output \ 21 | --system 'You are a helpful assistant.' \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 \ 24 | --model_author swift \ 25 | --model_name swift-robot 26 | -------------------------------------------------------------------------------- /examples/train/grpo/internal/README.md: -------------------------------------------------------------------------------- 1 | # README: GRPO Internal(Colocate) Mode Execution Scripts 2 | 3 | --- 4 | **NOTE** 5 | The scripts in this folder require the source code version of ms-swift. 6 | 7 | ``` 8 | git clone https://github.com/modelscope/ms-swift.git 9 | cd ms-swift 10 | pip install -e . 11 | ``` 12 | 13 | ## **Introduction** 14 | 15 | The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows you to deploy vLLM and perform training using the same GPU resources. 
16 | 17 | This folder contains scripts and instructions for running GRPO in **Internal Mode**. 18 | 19 | ## Training with Internal Mode 20 | ```bash 21 | --use_vllm true \ 22 | --vllm_mode colocate \ 23 | --vllm_gpu_memory_utilization [ut_ratio] \ 24 | ``` 25 | 26 | ## Multi-Node Training 27 | On each node, execute the original single-node training script with the environment variables `NNODES` and `NODE_RANK` set, and ensure consistent use of configuration parameters across all nodes. 28 | -------------------------------------------------------------------------------- /examples/train/grpo/internal/full_lmdeploy.sh: -------------------------------------------------------------------------------- 1 | # The LMDeploy backend in GRPO has been deprecated in Swift 3.5. 2 | # You can install Swift 3.4 to continue using it with the following script: 3 | # https://github.com/modelscope/ms-swift/blob/v3.4.1/examples/train/grpo/internal/full_lmdeploy.sh 4 | -------------------------------------------------------------------------------- /examples/train/grpo/internal/vllm_vl7b.sh: -------------------------------------------------------------------------------- 1 | MAX_PIXELS=1003520 \ 2 | NPROC_PER_NODE=8 \ 3 | swift rlhf \ 4 | --rlhf_type grpo \ 5 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 6 | --train_type lora \ 7 | --dataset AI-ModelScope/chartqa_digit_r1v_format \ 8 | --use_vllm true \ 9 | --vllm_mode colocate \ 10 | --vllm_gpu_memory_utilization 0.5 \ 11 | --vllm_tensor_parallel_size 4 \ 12 | --torch_dtype bfloat16 \ 13 | --system examples/train/grpo/prompt.txt \ 14 | --num_train_epochs 1 \ 15 | --per_device_train_batch_size 1 \ 16 | --per_device_eval_batch_size 1 \ 17 | --learning_rate 1e-6 \ 18 | --save_total_limit 2 \ 19 | --logging_steps 5 \ 20 | --output_dir output \ 21 | --gradient_accumulation_steps 1 \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 \ 24 | --max_completion_length 1024 \ 25 | --reward_funcs accuracy format \ 26 | --num_generations 8 \ 27 | 
--sleep_level 1 \ 28 | --temperature 1.0 \ 29 | --top_p 0.85 30 | -------------------------------------------------------------------------------- /examples/train/grpo/plugin/run_external_reward_model.sh: -------------------------------------------------------------------------------- 1 | # see rm_plugin example in swift/plugin/rm_plugin.py 2 | # register customized plugin in external_plugins file 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 5 | NPROC_PER_NODE=8 \ 6 | swift rlhf \ 7 | --rlhf_type grpo \ 8 | --model Qwen/Qwen2.5-7B \ 9 | --dataset AI-MO/NuminaMath-TIR#5000 \ 10 | --use_vllm true \ 11 | --vllm_mode colocate \ 12 | --vllm_gpu_memory_utilization 0.5 \ 13 | --external_plugins examples/train/grpo/plugin/plugin.py \ 14 | --reward_funcs format \ 15 | --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \ 16 | --reward_model_plugin genrm my_rmplugin \ 17 | --reward_weights 0.1 1 1 \ 18 | --sleep_level 1 \ 19 | --offload_model true \ 20 | --offload_optimizer true \ 21 | --gc_collect_after_offload true \ 22 | --log_completions true \ 23 | --deepspeed zero2 24 | -------------------------------------------------------------------------------- /examples/train/grpo/prompt.txt: -------------------------------------------------------------------------------- 1 | A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here 2 | -------------------------------------------------------------------------------- /examples/train/grpo/qwen2_5_omni/infer.sh: -------------------------------------------------------------------------------- 1 | MAX_PIXELS=1003520 \ 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --adapters vx-xxx/checkpoint-xxx \ 5 | --load_data_args true \ 6 | --stream true \ 7 | --max_new_tokens 2048 8 | -------------------------------------------------------------------------------- /examples/train/infer.sh: -------------------------------------------------------------------------------- 1 | # If it's full parameter training, use `--model xxx` instead of `--adapters xxx`. 2 | # If you are using the validation set for inference, add the parameter `--load_data_args true`. 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift infer \ 5 | --adapters output/vx-xxx/checkpoint-xxx \ 6 | --stream true \ 7 | --temperature 0 \ 8 | --max_new_tokens 2048 9 | -------------------------------------------------------------------------------- /examples/train/lazy_tokenize/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 100 \ 14 | --save_steps 100 \ 15 | --save_total_limit 2 \ 16 | --logging_steps 5 \ 17 | --lazy_tokenize true \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/liger/sft.sh: -------------------------------------------------------------------------------- 1 | # test env: 4 
* A100 2 | # Using use_liger_kernel and packing: 4 * 42GB, 1 hour 35 minutes 3 | # Not using use_liger_kernel: 4 * 54GB, 1 hour 40 minutes 4 | # Not using use_liger_kernel and packing: 4 * 52GB, 3 hours 30 minutes 5 | 6 | NPROC_PER_NODE=4 \ 7 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 8 | swift sft \ 9 | --model Qwen/Qwen2.5-7B \ 10 | --train_type full \ 11 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT#10000' \ 12 | --torch_dtype bfloat16 \ 13 | --per_device_train_batch_size 1 \ 14 | --per_device_eval_batch_size 1 \ 15 | --learning_rate 1e-5 \ 16 | --num_train_epochs 5 \ 17 | --gradient_accumulation_steps 2 \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --logging_steps 5 \ 21 | --max_length 8192 \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 8 \ 24 | --dataset_num_proc 8 \ 25 | --save_total_limit 2 \ 26 | --save_only_model true \ 27 | --output_dir output/Qwen2.5-7B \ 28 | --deepspeed zero3 \ 29 | --attn_impl flash_attn \ 30 | --packing true \ 31 | --use_liger_kernel true 32 | -------------------------------------------------------------------------------- /examples/train/long_text/liger_kernel.sh: -------------------------------------------------------------------------------- 1 | # Env: 4 * A100 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/long_text.sh 3 | # Max Length: 16K 4 | # GPU Memory: 4 * 42GB, Training Speed 10s/it 5 | NPROC_PER_NODE=4 \ 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 7 | swift sft \ 8 | --model Qwen/Qwen2.5-7B \ 9 | --train_type full \ 10 | --dataset 'AI-ModelScope/LongAlpaca-12k' \ 11 | --torch_dtype bfloat16 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-5 \ 15 | --gradient_accumulation_steps 2 \ 16 | --packing true \ 17 | --eval_steps 200 \ 18 | --save_steps 200 \ 19 | --logging_steps 5 \ 20 | --max_length 16384 \ 21 | --warmup_ratio 0.05 \ 22 | --dataloader_num_workers 8 \ 23 | --dataset_num_proc 8 \ 24 | --save_total_limit 2 \ 25 | 
--save_only_model true \ 26 | --output_dir output/Qwen2.5-7B \ 27 | --deepspeed zero3 \ 28 | --use_liger_kernel true \ 29 | --attn_impl flash_attn 30 | -------------------------------------------------------------------------------- /examples/train/long_text/sequence_parallel.sh: -------------------------------------------------------------------------------- 1 | # Env: 4 * A100 2 | # Max Length: 65536 3 | # GPU Memory: 4 * 53GiB, Training Speed 50s/it 4 | NPROC_PER_NODE=4 \ 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type full \ 9 | --dataset 'AI-ModelScope/LongAlpaca-12k' \ 10 | --torch_dtype bfloat16 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 8 \ 15 | --packing true \ 16 | --rope_scaling yarn \ 17 | --max_length 65536 \ 18 | --eval_steps 200 \ 19 | --save_steps 200 \ 20 | --logging_steps 5 \ 21 | --warmup_ratio 0.05 \ 22 | --dataloader_num_workers 8 \ 23 | --dataset_num_proc 8 \ 24 | --save_total_limit 2 \ 25 | --save_only_model true \ 26 | --output_dir output/Qwen2.5-7B-Instruct \ 27 | --deepspeed zero3 \ 28 | --attn_impl flash_attn \ 29 | --sequence_parallel_size 4 30 | -------------------------------------------------------------------------------- /examples/train/long_text/sequence_parallel_512k.sh: -------------------------------------------------------------------------------- 1 | # Env: 8 * A100 2 | # Max Length: 512000 3 | # GPU Memory: 8 * 80GiB, Training Speed 150s/it 4 | NPROC_PER_NODE=8 \ 5 | CELOSS_PARALLEL_SIZE=2048 \ 6 | swift sft \ 7 | --model Qwen/QwQ-32B \ 8 | --train_type lora \ 9 | --dataset 'AI-ModelScope/LongAlpaca-12k' \ 10 | --torch_dtype bfloat16 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 2 \ 15 | --packing true \ 16 | --rope_scaling yarn \ 17 | --max_length 512000 \ 18 | --eval_steps 
200 \ 19 | --save_steps 200 \ 20 | --logging_steps 5 \ 21 | --warmup_ratio 0.05 \ 22 | --dataloader_num_workers 8 \ 23 | --dataset_num_proc 8 \ 24 | --save_total_limit 2 \ 25 | --use_liger_kernel true \ 26 | --save_only_model true \ 27 | --deepspeed zero3_offload \ 28 | --attn_impl flash_attn \ 29 | --sequence_parallel_size 8 30 | -------------------------------------------------------------------------------- /examples/train/long_text/sequence_parallel_dpo.sh: -------------------------------------------------------------------------------- 1 | # Env: 4 * A100 2 | # GPU Memory: 4 * 25GiB, Training Speed 14s/it 3 | NPROC_PER_NODE=4 \ 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | swift rlhf \ 6 | --rlhf_type dpo \ 7 | --model Qwen/Qwen2.5-VL-3B-Instruct \ 8 | --train_type full \ 9 | --dataset swift/RLAIF-V-Dataset \ 10 | --torch_dtype bfloat16 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 8 \ 15 | --eval_steps 200 \ 16 | --save_steps 200 \ 17 | --logging_steps 5 \ 18 | --warmup_ratio 0.05 \ 19 | --dataloader_num_workers 8 \ 20 | --dataset_num_proc 8 \ 21 | --save_total_limit 2 \ 22 | --save_only_model true \ 23 | --output_dir output/Qwen2.5-VL-3B-Instruct \ 24 | --deepspeed zero3 \ 25 | --attn_impl flash_attn \ 26 | --use_liger_kernel true \ 27 | --sequence_parallel_size 4 28 | -------------------------------------------------------------------------------- /examples/train/lora_sft.sh: -------------------------------------------------------------------------------- 1 | # 22GB 2 | # qwen3: https://github.com/modelscope/ms-swift/blob/main/examples/train/think_model/qwen3_demo1.sh 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type lora \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'swift/self-cognition#500' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | 
--per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 50 \ 20 | --save_steps 50 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' \ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot 30 | -------------------------------------------------------------------------------- /examples/train/megatron/base_to_chat.sh: -------------------------------------------------------------------------------- 1 | # 8 * 65GiB 2 | NPROC_PER_NODE=8 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 4 | megatron sft \ 5 | --load Qwen2.5-14B-mcore \ 6 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 7 | --tensor_model_parallel_size 4 \ 8 | --micro_batch_size 1 \ 9 | --global_batch_size 16 \ 10 | --packing true \ 11 | --recompute_granularity selective \ 12 | --train_iters 2000 \ 13 | --eval_iters 50 \ 14 | --finetune true \ 15 | --cross_entropy_loss_fusion true \ 16 | --lr 1e-5 \ 17 | --lr_warmup_iters 100 \ 18 | --min_lr 1e-6 \ 19 | --save megatron_output/Qwen2.5-14B \ 20 | --eval_interval 200 \ 21 | --save_interval 200 \ 22 | --max_length 8192 \ 23 | --num_workers 8 \ 24 | --dataset_num_proc 8 \ 25 | --no_save_optim true \ 26 | --no_save_rng true \ 27 | --sequence_parallel true \ 28 | --use_flash_attn true 29 | -------------------------------------------------------------------------------- /examples/train/megatron/benchmark/deepspeed.sh: -------------------------------------------------------------------------------- 1 | # 8 * 80GiB 2 | # Corresponding Megatron-SWIFT script reference: 3 | # https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron/base_to_chat.sh 4 | NPROC_PER_NODE=8 \ 5 | 
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-14B \ 8 | --train_type full \ 9 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 10 | --torch_dtype bfloat16 \ 11 | --max_steps 2000 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-5 \ 15 | --gradient_accumulation_steps 2 \ 16 | --packing true \ 17 | --eval_steps 200 \ 18 | --save_steps 200 \ 19 | --logging_steps 5 \ 20 | --max_length 8192 \ 21 | --warmup_ratio 0.05 \ 22 | --dataloader_num_workers 8 \ 23 | --dataset_num_proc 8 \ 24 | --save_total_limit -1 \ 25 | --save_only_model true \ 26 | --output_dir output/Qwen2.5-14B \ 27 | --deepspeed zero2 \ 28 | --attn_impl flash_attn 29 | -------------------------------------------------------------------------------- /examples/train/megatron/long_text.sh: -------------------------------------------------------------------------------- 1 | # Env: 4 * A100 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/long_text/liger_kernel.sh 3 | # Max Length: 32K 4 | # GPU Memory: 4 * 50GB, Training Speed 23s/it 5 | NPROC_PER_NODE=4 \ 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 7 | megatron sft \ 8 | --load Qwen2.5-7B-mcore \ 9 | --dataset 'ZhipuAI/LongWriter-6k' \ 10 | --tensor_model_parallel_size 4 \ 11 | --micro_batch_size 1 \ 12 | --global_batch_size 8 \ 13 | --packing true \ 14 | --recompute_granularity full \ 15 | --recompute_method uniform \ 16 | --recompute_num_layers 1 \ 17 | --train_iters 1000 \ 18 | --eval_iters 50 \ 19 | --finetune true \ 20 | --cross_entropy_loss_fusion true \ 21 | --lr 1e-5 \ 22 | --lr_warmup_iters 100 \ 23 | --min_lr 1e-6 \ 24 | --save megatron_output/Qwen2.5-7B \ 25 | --eval_interval 200 \ 26 | --save_interval 200 \ 27 | --max_length 32768 \ 28 | --num_workers 8 \ 29 | --dataset_num_proc 8 \ 30 | --no_save_optim true \ 31 | --no_save_rng true \ 32 | --sequence_parallel true \ 33 | --use_flash_attn true 34 | 
-------------------------------------------------------------------------------- /examples/train/megatron/moe.sh: -------------------------------------------------------------------------------- 1 | # 8 * 65GiB 2 | NPROC_PER_NODE=8 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 4 | megatron sft \ 5 | --load Qwen1.5-MoE-A2.7B-mcore \ 6 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 7 | --tensor_model_parallel_size 2 \ 8 | --expert_model_parallel_size 4 \ 9 | --moe_grouped_gemm true \ 10 | --moe_shared_expert_overlap true \ 11 | --moe_aux_loss_coeff 0.01 \ 12 | --micro_batch_size 1 \ 13 | --global_batch_size 16 \ 14 | --packing true \ 15 | --recompute_granularity selective \ 16 | --train_iters 2000 \ 17 | --eval_iters 50 \ 18 | --finetune true \ 19 | --cross_entropy_loss_fusion true \ 20 | --lr 1e-5 \ 21 | --lr_warmup_iters 100 \ 22 | --min_lr 1e-6 \ 23 | --save megatron_output/Qwen1.5-MoE-A2.7B \ 24 | --eval_interval 200 \ 25 | --save_interval 200 \ 26 | --max_length 8192 \ 27 | --num_workers 8 \ 28 | --dataset_num_proc 8 \ 29 | --no_save_optim true \ 30 | --no_save_rng true \ 31 | --sequence_parallel true \ 32 | --use_flash_attn true 33 | -------------------------------------------------------------------------------- /examples/train/megatron/multi-node/node1.sh: -------------------------------------------------------------------------------- 1 | # For more information on multi-node training launch methods, refer to: 2 | # https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | NNODES=2 \ 6 | NODE_RANK=0 \ 7 | MASTER_ADDR=127.0.0.1 \ 8 | MASTER_PORT=29500 \ 9 | NPROC_PER_NODE=4 \ 10 | megatron sft \ 11 | --load Qwen2.5-14B-mcore \ 12 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 13 | --tensor_model_parallel_size 4 \ 14 | --micro_batch_size 1 \ 15 | --global_batch_size 16 \ 16 | --packing true \ 17 | --recompute_granularity selective \ 18 | --train_iters 2000 \ 19 | 
--eval_iters 50 \ 20 | --finetune true \ 21 | --cross_entropy_loss_fusion true \ 22 | --lr 1e-5 \ 23 | --lr_warmup_iters 100 \ 24 | --min_lr 1e-6 \ 25 | --save megatron_output/Qwen2.5-14B \ 26 | --eval_interval 200 \ 27 | --save_interval 200 \ 28 | --max_length 8192 \ 29 | --num_workers 8 \ 30 | --dataset_num_proc 8 \ 31 | --no_save_optim true \ 32 | --no_save_rng true \ 33 | --sequence_parallel true \ 34 | --use_flash_attn true 35 | -------------------------------------------------------------------------------- /examples/train/megatron/multi-node/node2.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 2 | NNODES=2 \ 3 | NODE_RANK=1 \ 4 | MASTER_ADDR=xxx.xxx.xxx.xxx \ 5 | MASTER_PORT=29500 \ 6 | NPROC_PER_NODE=4 \ 7 | megatron sft \ 8 | --load Qwen2.5-14B-mcore \ 9 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 10 | --tensor_model_parallel_size 4 \ 11 | --micro_batch_size 1 \ 12 | --global_batch_size 16 \ 13 | --packing true \ 14 | --recompute_granularity selective \ 15 | --train_iters 2000 \ 16 | --eval_iters 50 \ 17 | --finetune true \ 18 | --cross_entropy_loss_fusion true \ 19 | --lr 1e-5 \ 20 | --lr_warmup_iters 100 \ 21 | --min_lr 1e-6 \ 22 | --save megatron_output/Qwen2.5-14B \ 23 | --eval_interval 200 \ 24 | --save_interval 200 \ 25 | --max_length 8192 \ 26 | --num_workers 8 \ 27 | --dataset_num_proc 8 \ 28 | --no_save_optim true \ 29 | --no_save_rng true \ 30 | --sequence_parallel true \ 31 | --use_flash_attn true 32 | -------------------------------------------------------------------------------- /examples/train/megatron/pretrain.sh: -------------------------------------------------------------------------------- 1 | # 4 * 80GiB 2 | NPROC_PER_NODE=4 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 4 | megatron pt \ 5 | --load Qwen2.5-7B-mcore \ 6 | --dataset swift/chinese-c4 \ 7 | --streaming true \ 8 | --packing true \ 9 | --tensor_model_parallel_size 4 \ 10 | 
--micro_batch_size 1 \ 11 | --global_batch_size 16 \ 12 | --recompute_granularity selective \ 13 | --train_iters 10000 \ 14 | --eval_iters 100 \ 15 | --finetune true \ 16 | --cross_entropy_loss_fusion true \ 17 | --lr 1e-5 \ 18 | --lr_warmup_iters 300 \ 19 | --min_lr 1e-6 \ 20 | --save megatron_output/Qwen2.5-7B \ 21 | --eval_interval 500 \ 22 | --save_interval 500 \ 23 | --max_length 8192 \ 24 | --num_workers 4 \ 25 | --dataset_num_proc 8 \ 26 | --no_save_optim true \ 27 | --no_save_rng true \ 28 | --sequence_parallel true \ 29 | --use_flash_attn true 30 | -------------------------------------------------------------------------------- /examples/train/megatron/qwen3_32b.sh: -------------------------------------------------------------------------------- 1 | # 8 * 80GiB 2 | PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ 3 | NPROC_PER_NODE=8 \ 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 5 | megatron sft \ 6 | --load Qwen3-32B-mcore \ 7 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 8 | --tensor_model_parallel_size 8 \ 9 | --micro_batch_size 1 \ 10 | --global_batch_size 16 \ 11 | --packing true \ 12 | --recompute_granularity full \ 13 | --recompute_method uniform \ 14 | --recompute_num_layers 1 \ 15 | --train_iters 10000 \ 16 | --max_epochs 5 \ 17 | --eval_iters 50 \ 18 | --finetune true \ 19 | --cross_entropy_loss_fusion true \ 20 | --lr 1e-5 \ 21 | --lr_warmup_iters 100 \ 22 | --min_lr 1e-6 \ 23 | --save megatron_output/Qwen3-32B \ 24 | --eval_interval 500 \ 25 | --save_interval 500 \ 26 | --max_length 8192 \ 27 | --num_workers 8 \ 28 | --dataset_num_proc 8 \ 29 | --no_save_optim true \ 30 | --no_save_rng true \ 31 | --sequence_parallel true \ 32 | --attention_backend flash 33 | -------------------------------------------------------------------------------- /examples/train/megatron/sft.sh: -------------------------------------------------------------------------------- 1 | # 2 * 80GiB 2 | NPROC_PER_NODE=2 \ 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 
| megatron sft \ 5 | --load Qwen2.5-7B-Instruct-mcore \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 8 | 'swift/self-cognition#500' \ 9 | --tensor_model_parallel_size 2 \ 10 | --micro_batch_size 4 \ 11 | --global_batch_size 16 \ 12 | --recompute_granularity selective \ 13 | --train_iters 100 \ 14 | --eval_iters 5 \ 15 | --finetune true \ 16 | --cross_entropy_loss_fusion true \ 17 | --lr 1e-5 \ 18 | --lr_warmup_iters 10 \ 19 | --min_lr 1e-6 \ 20 | --save megatron_output/Qwen2.5-7B-Instruct \ 21 | --save_interval 100 \ 22 | --max_length 2048 \ 23 | --system 'You are a helpful assistant.' \ 24 | --num_workers 4 \ 25 | --no_save_optim true \ 26 | --no_save_rng true \ 27 | --dataset_num_proc 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot 30 | -------------------------------------------------------------------------------- /examples/train/moe/llama4.sh: -------------------------------------------------------------------------------- 1 | # Manually select `target_modules` to avoid 'all-linear' selecting 'router' 2 | NPROC_PER_NODE=4 \ 3 | USE_HF=1 \ 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | swift sft \ 6 | --model meta-llama/Llama-4-Scout-17B-16E-Instruct \ 7 | --dataset 'linxy/LaTeX_OCR:full#5000' \ 8 | --train_type lora \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_regex '^(language_model).*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$' \ 17 | --freeze_vit true \ 18 | --gradient_accumulation_steps 4 \ 19 | --gradient_checkpointing true \ 20 | --eval_steps 50 \ 21 | --save_steps 50 \ 22 | --save_total_limit 2 \ 23 | --logging_steps 5 \ 24 | --max_length 2048 \ 25 | --output_dir output \ 26 | --warmup_ratio 0.05 \ 27 | --deepspeed zero3 \ 28 | --dataloader_num_workers 4 29 | 
-------------------------------------------------------------------------------- /examples/train/moe/qwen2_5_moe.sh: -------------------------------------------------------------------------------- 1 | # Manually select `target_modules` to avoid 'all-linear' selecting 'gate' 2 | CUDA_VISIBLE_DEVICES=0,1 \ 3 | swift sft \ 4 | --model Qwen/Qwen2-57B-A14B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 8 | 'swift/self-cognition#500' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --system 'You are a helpful assistant.' 
\ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --model_author swift \ 28 | --model_name swift-robot 29 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/ddp/train.sh: -------------------------------------------------------------------------------- 1 | # 27.5GiB * 2 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 \ 5 | NPROC_PER_NODE=$nproc_per_node \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --torch_dtype bfloat16 \ 10 | --dataset 'swift/self-cognition#1000' \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' 
\ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot \ 30 | --gradient_checkpointing_kwargs '{"use_reentrant": false}' 31 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/ddp_device_map/train.sh: -------------------------------------------------------------------------------- 1 | # 14GiB * 4 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | NPROC_PER_NODE=$nproc_per_node \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset 'swift/self-cognition#1000' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' 
\ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot \ 30 | --gradient_checkpointing_kwargs '{"use_reentrant": false}' 31 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/deepspeed/train_zero2.sh: -------------------------------------------------------------------------------- 1 | # 18GiB * 2 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 \ 5 | NPROC_PER_NODE=$nproc_per_node \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset 'swift/self-cognition#1000' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' 
\ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot \ 30 | --deepspeed zero2 31 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/deepspeed/train_zero3.sh: -------------------------------------------------------------------------------- 1 | # 16GiB * 2 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 \ 5 | NPROC_PER_NODE=$nproc_per_node \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset 'swift/self-cognition#1000' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --system 'You are a helpful assistant.' 
\ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --model_author swift \ 29 | --model_name swift-robot \ 30 | --deepspeed zero3 31 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/device_map/train.sh: -------------------------------------------------------------------------------- 1 | # 2 * 76GiB 2 | CUDA_VISIBLE_DEVICES=0,1 \ 3 | MAX_PIXELS=1003520 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-VL-72B-Instruct \ 6 | --dataset 'modelscope/coco_2014_caption:validation#20000' \ 7 | --train_type lora \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 \ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --freeze_vit true \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 26 | -------------------------------------------------------------------------------- /examples/train/multi-gpu/fsdp_qlora/fsdp_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "compute_environment": "LOCAL_MACHINE", 3 | "debug": false, 4 | "distributed_type": "FSDP", 5 | "downcast_bf16": "no", 6 | "fsdp_config": { 7 | "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", 8 | "fsdp_backward_prefetch": "BACKWARD_PRE", 9 | "fsdp_cpu_ram_efficient_loading": true, 10 | "fsdp_forward_prefetch": false, 11 | "fsdp_offload_params": true, 12 | "fsdp_sharding_strategy": "FULL_SHARD", 13 | "fsdp_state_dict_type": "FULL_STATE_DICT", 14 | "fsdp_sync_module_states": true, 15 | "fsdp_use_orig_params": false 16 | }, 17 | "machine_rank": 0, 18 | "main_training_function": "main", 19 | "mixed_precision": "no", 20 | "num_machines": 1, 21 | "num_processes": 
2, 22 | "rdzv_backend": "static", 23 | "same_network": true, 24 | "tpu_env": [], 25 | "tpu_use_cluster": false, 26 | "tpu_use_sudo": false, 27 | "use_cpu": false 28 | } 29 | -------------------------------------------------------------------------------- /examples/train/multi-node/accelerate/multi_node.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_accumulation_steps: 16 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 3 9 | distributed_type: DEEPSPEED 10 | main_process_ip: 'xxx.xxx.xxx.xxx' 11 | main_process_port: 29500 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 2 15 | num_processes: 8 # world size 16 | rdzv_backend: static 17 | use_cpu: false 18 | -------------------------------------------------------------------------------- /examples/train/multi-node/accelerate/train_node1.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 2 | accelerate launch --config_file ./examples/train/multi-node/accelerate/multi_node.yaml --machine_rank 0 \ 3 | swift/cli/sft.py \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --torch_dtype bfloat16 \ 7 | --dataset 'swift/self-cognition#1000' \ 8 | --num_train_epochs 1 \ 9 | --lora_rank 8 \ 10 | --lora_alpha 32 \ 11 | --learning_rate 1e-4 \ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 100 \ 14 | --save_steps 100 \ 15 | --save_total_limit 2 \ 16 | --logging_steps 5 \ 17 | --model_author swift \ 18 | --model_name swift-robot 19 | -------------------------------------------------------------------------------- /examples/train/multi-node/accelerate/train_node2.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 2 | accelerate 
launch --config_file ./examples/train/multi-node/accelerate/multi_node.yaml --machine_rank 1 \ 3 | swift/cli/sft.py \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --torch_dtype bfloat16 \ 7 | --dataset 'swift/self-cognition#1000' \ 8 | --num_train_epochs 1 \ 9 | --lora_rank 8 \ 10 | --lora_alpha 32 \ 11 | --learning_rate 1e-4 \ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 100 \ 14 | --save_steps 100 \ 15 | --save_total_limit 2 \ 16 | --logging_steps 5 \ 17 | --model_author swift \ 18 | --model_name swift-robot 19 | -------------------------------------------------------------------------------- /examples/train/multi-node/deepspeed/host.txt: -------------------------------------------------------------------------------- 1 | worker-0 slots=2 2 | worker-1 slots=2 3 | -------------------------------------------------------------------------------- /examples/train/multi-node/deepspeed/train.sh: -------------------------------------------------------------------------------- 1 | # If you need only a part of the GPUs in every node, try: 2 | # --include="worker-0:0,1@worker-1:2,3" 3 | deepspeed --hostfile=./examples/train/multi-node/deepspeed/host.txt \ 4 | swift/cli/sft.py \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type lora \ 7 | --torch_dtype bfloat16 \ 8 | --dataset 'swift/self-cognition#1000' \ 9 | --num_train_epochs 1 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --learning_rate 1e-4 \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/multi-node/dlc/train.sh: -------------------------------------------------------------------------------- 1 | # https://help.aliyun.com/zh/pai/user-guide/general-environment-variables 2 | NNODES=$WORLD_SIZE \ 3 | NODE_RANK=$RANK \ 4 | 
swift sft \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type full \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 4 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --output_dir output \ 21 | --system 'You are a helpful assistant.' \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 \ 24 | --deepspeed zero2 25 | -------------------------------------------------------------------------------- /examples/train/multi-node/swift/train_node1.sh: -------------------------------------------------------------------------------- 1 | nnodes=2 2 | nproc_per_node=4 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | NNODES=$nnodes \ 6 | NODE_RANK=0 \ 7 | MASTER_ADDR=127.0.0.1 \ 8 | MASTER_PORT=29500 \ 9 | NPROC_PER_NODE=$nproc_per_node \ 10 | swift sft \ 11 | --model Qwen/Qwen2.5-7B-Instruct \ 12 | --train_type full \ 13 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \ 14 | 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ 15 | --torch_dtype bfloat16 \ 16 | --num_train_epochs 1 \ 17 | --per_device_train_batch_size 1 \ 18 | --per_device_eval_batch_size 1 \ 19 | --learning_rate 1e-5 \ 20 | --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \ 21 | --eval_steps 100 \ 22 | --save_steps 100 \ 23 | --save_total_limit 2 \ 24 | --logging_steps 5 \ 25 | --max_length 8192 \ 26 | --output_dir output \ 27 | --system 'You are a helpful assistant.' 
\ 28 | --warmup_ratio 0.05 \ 29 | --dataloader_num_workers 4 \ 30 | --deepspeed zero2 31 | -------------------------------------------------------------------------------- /examples/train/multi-node/swift/train_node2.sh: -------------------------------------------------------------------------------- 1 | nnodes=2 2 | nproc_per_node=4 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | NNODES=$nnodes \ 6 | NODE_RANK=1 \ 7 | MASTER_ADDR=xxx.xxx.xxx.xxx \ 8 | MASTER_PORT=29500 \ 9 | NPROC_PER_NODE=$nproc_per_node \ 10 | swift sft \ 11 | --model Qwen/Qwen2.5-7B-Instruct \ 12 | --train_type full \ 13 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \ 14 | 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ 15 | --torch_dtype bfloat16 \ 16 | --num_train_epochs 1 \ 17 | --per_device_train_batch_size 1 \ 18 | --per_device_eval_batch_size 1 \ 19 | --learning_rate 1e-5 \ 20 | --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \ 21 | --eval_steps 100 \ 22 | --save_steps 100 \ 23 | --save_total_limit 2 \ 24 | --logging_steps 5 \ 25 | --max_length 8192 \ 26 | --output_dir output \ 27 | --system 'You are a helpful assistant.' 
\ 28 | --warmup_ratio 0.05 \ 29 | --dataloader_num_workers 4 \ 30 | --deepspeed zero2 31 | -------------------------------------------------------------------------------- /examples/train/multi-node/torchrun/train_node1.sh: -------------------------------------------------------------------------------- 1 | nnodes=2 2 | nproc_per_node=4 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | torchrun \ 6 | --master_port 29500 \ 7 | --nproc_per_node=$nproc_per_node \ 8 | --nnodes=$nnodes \ 9 | --node_rank=0 \ 10 | --master_addr=127.0.0.1 \ 11 | swift/cli/sft.py \ 12 | --model Qwen/Qwen2.5-7B-Instruct \ 13 | --train_type full \ 14 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \ 15 | 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ 16 | --torch_dtype bfloat16 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 1 \ 19 | --per_device_eval_batch_size 1 \ 20 | --learning_rate 1e-5 \ 21 | --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \ 22 | --eval_steps 100 \ 23 | --save_steps 100 \ 24 | --save_total_limit 2 \ 25 | --logging_steps 5 \ 26 | --max_length 8192 \ 27 | --output_dir output \ 28 | --system 'You are a helpful assistant.' 
\ 29 | --warmup_ratio 0.05 \ 30 | --dataloader_num_workers 4 \ 31 | --deepspeed zero2 32 | -------------------------------------------------------------------------------- /examples/train/multi-node/torchrun/train_node2.sh: -------------------------------------------------------------------------------- 1 | nnodes=2 2 | nproc_per_node=4 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | torchrun \ 6 | --master_port 29500 \ 7 | --nproc_per_node=$nproc_per_node \ 8 | --nnodes=$nnodes \ 9 | --node_rank=1 \ 10 | --master_addr=xxx.xxx.xxx.xxx \ 11 | swift/cli/sft.py \ 12 | --model Qwen/Qwen2.5-7B-Instruct \ 13 | --train_type full \ 14 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \ 15 | 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ 16 | --torch_dtype bfloat16 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 1 \ 19 | --per_device_eval_batch_size 1 \ 20 | --learning_rate 1e-5 \ 21 | --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \ 22 | --eval_steps 100 \ 23 | --save_steps 100 \ 24 | --save_total_limit 2 \ 25 | --logging_steps 5 \ 26 | --max_length 8192 \ 27 | --output_dir output \ 28 | --system 'You are a helpful assistant.' 
\ 29 | --warmup_ratio 0.05 \ 30 | --dataloader_num_workers 4 \ 31 | --deepspeed zero2 32 | -------------------------------------------------------------------------------- /examples/train/multimodal/audio.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model Qwen/Qwen2-Audio-7B-Instruct \ 4 | --dataset 'speech_asr/speech_asr_aishell1_trainsets:validation#20000' \ 5 | --train_type lora \ 6 | --torch_dtype bfloat16 \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --per_device_eval_batch_size 1 \ 10 | --learning_rate 1e-4 \ 11 | --lora_rank 8 \ 12 | --lora_alpha 32 \ 13 | --target_modules all-linear \ 14 | --freeze_vit true \ 15 | --gradient_accumulation_steps 16 \ 16 | --eval_steps 100 \ 17 | --save_steps 100 \ 18 | --save_total_limit 2 \ 19 | --logging_steps 5 \ 20 | --max_length 2048 \ 21 | --output_dir output \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 24 | -------------------------------------------------------------------------------- /examples/train/multimodal/caption.sh: -------------------------------------------------------------------------------- 1 | # 22GiB 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter. 
3 | # 1003520 = 1280 * 28 * 28 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | MAX_PIXELS=1003520 \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 8 | --dataset 'modelscope/coco_2014_caption:validation#20000' \ 9 | --train_type lora \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --freeze_vit true \ 19 | --gradient_accumulation_steps 16 \ 20 | --eval_steps 100 \ 21 | --save_steps 100 \ 22 | --save_total_limit 2 \ 23 | --logging_steps 5 \ 24 | --max_length 2048 \ 25 | --output_dir output \ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 28 | -------------------------------------------------------------------------------- /examples/train/multimodal/grounding.sh: -------------------------------------------------------------------------------- 1 | # 20GiB 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter. 
3 | CUDA_VISIBLE_DEVICES=0 \ 4 | MAX_PIXELS=1003520 \ 5 | swift sft \ 6 | --model Qwen/Qwen2-VL-7B-Instruct \ 7 | --dataset 'AI-ModelScope/coco#20000' \ 8 | --train_type lora \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --freeze_vit true \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --dataset_num_proc 4 28 | -------------------------------------------------------------------------------- /examples/train/multimodal/infer.sh: -------------------------------------------------------------------------------- 1 | # Perform inference using the validation set from the training phase. 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | MAX_PIXELS=1003520 \ 4 | swift infer \ 5 | --adapters output/vx-xxx/checkpoint-xxx \ 6 | --stream true \ 7 | --load_data_args true \ 8 | --max_new_tokens 2048 9 | -------------------------------------------------------------------------------- /examples/train/multimodal/lora_llm_full_vit/infer.sh: -------------------------------------------------------------------------------- 1 | # If the weights have been merged, please use `--model`. 
2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift infer \ 4 | --adapters output/vx-xxx/checkpoint-xxx \ 5 | --stream true \ 6 | --load_data_args true \ 7 | --temperature 0 \ 8 | --max_new_tokens 2048 9 | -------------------------------------------------------------------------------- /examples/train/multimodal/lora_llm_full_vit/merge_lora.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift export \ 3 | --adapters output/vx-xxx/checkpoint-xxx \ 4 | --merge_lora true 5 | 6 | # CUDA_VISIBLE_DEVICES=0 \ 7 | # swift infer \ 8 | # --model output/vx-xxx/checkpoint-xxx-merged \ 9 | # --stream true \ 10 | # --load_data_args true \ 11 | # --temperature 0 \ 12 | # --max_new_tokens 2048 13 | -------------------------------------------------------------------------------- /examples/train/multimodal/lora_llm_full_vit/sft.sh: -------------------------------------------------------------------------------- 1 | # 4 * 22GiB 2 | # vit/merger lr 1e-5; llm lora lr 1e-4 3 | NPROC_PER_NODE=4 \ 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 5 | MAX_PIXELS=1003520 \ 6 | swift sft \ 7 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 8 | --dataset 'AI-ModelScope/coco#20000' \ 9 | --train_type custom \ 10 | --external_plugins 'examples/train/multimodal/lora_llm_full_vit/custom_plugin.py' \ 11 | --torch_dtype bfloat16 \ 12 | --num_train_epochs 1 \ 13 | --per_device_train_batch_size 1 \ 14 | --per_device_eval_batch_size 1 \ 15 | --learning_rate 1e-4 \ 16 | --vit_lr 1e-5 \ 17 | --aligner_lr 1e-5 \ 18 | --lora_rank 16 \ 19 | --lora_alpha 32 \ 20 | --gradient_accumulation_steps 4 \ 21 | --eval_steps 100 \ 22 | --save_steps 100 \ 23 | --save_total_limit 2 \ 24 | --logging_steps 5 \ 25 | --max_length 8192 \ 26 | --output_dir output \ 27 | --warmup_ratio 0.05 \ 28 | --dataloader_num_workers 4 \ 29 | --dataset_num_proc 4 \ 30 | --deepspeed zero2 \ 31 | --save_only_model true 32 | -------------------------------------------------------------------------------- 
/examples/train/multimodal/ocr.sh: -------------------------------------------------------------------------------- 1 | # 20GB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | MAX_PIXELS=1003520 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 6 | --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \ 7 | --train_type lora \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 \ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --freeze_vit true \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 26 | -------------------------------------------------------------------------------- /examples/train/multimodal/omni/infer.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | VIDEO_MAX_PIXELS=50176 \ 3 | FPS_MAX_FRAMES=12 \ 4 | MAX_PIXELS=1003520 \ 5 | ENABLE_AUDIO_OUTPUT=0 \ 6 | swift infer \ 7 | --adapters output/vx-xxx/checkpoint-xxx \ 8 | --stream true \ 9 | --load_data_args true \ 10 | --max_new_tokens 2048 11 | -------------------------------------------------------------------------------- /examples/train/multimodal/rlhf/dpo/full.sh: -------------------------------------------------------------------------------- 1 | # 4 * 50GiB 2 | nproc_per_node=4 3 | 4 | PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | MAX_PIXELS=1003520 \ 8 | swift rlhf \ 9 | --rlhf_type dpo \ 10 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 11 | --dataset 'swift/RLAIF-V-Dataset#20000' \ 12 | --train_type full \ 13 | --torch_dtype bfloat16 \ 14 | --num_train_epochs 1 \ 15 | --per_device_train_batch_size 1 \ 16 | 
--per_device_eval_batch_size 1 \ 17 | --learning_rate 1e-5 \ 18 | --freeze_vit true \ 19 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 20 | --eval_steps 100 \ 21 | --save_steps 100 \ 22 | --save_total_limit 2 \ 23 | --deepspeed zero3 \ 24 | --logging_steps 5 \ 25 | --max_length 4096 \ 26 | --output_dir output \ 27 | --warmup_ratio 0.05 \ 28 | --dataloader_num_workers 4 \ 29 | --dataset_num_proc 4 \ 30 | --save_only_model true 31 | -------------------------------------------------------------------------------- /examples/train/multimodal/rlhf/dpo/lora.sh: -------------------------------------------------------------------------------- 1 | # 4 * 50GiB 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter. 3 | # --rlhf_type cpo/orpo/simpo/rm are also supported 4 | nproc_per_node=2 5 | 6 | CUDA_VISIBLE_DEVICES=0,1 \ 7 | NPROC_PER_NODE=$nproc_per_node \ 8 | MAX_PIXELS=1003520 \ 9 | swift rlhf \ 10 | --rlhf_type dpo \ 11 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 12 | --dataset 'swift/RLAIF-V-Dataset#20000' \ 13 | --train_type lora \ 14 | --torch_dtype bfloat16 \ 15 | --num_train_epochs 1 \ 16 | --per_device_train_batch_size 1 \ 17 | --per_device_eval_batch_size 1 \ 18 | --learning_rate 1e-4 \ 19 | --lora_rank 8 \ 20 | --lora_alpha 32 \ 21 | --target_modules all-linear \ 22 | --freeze_vit true \ 23 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 24 | --eval_steps 100 \ 25 | --save_steps 100 \ 26 | --save_total_limit 2 \ 27 | --deepspeed zero2 \ 28 | --logging_steps 5 \ 29 | --max_length 4096 \ 30 | --output_dir output \ 31 | --warmup_ratio 0.05 \ 32 | --dataloader_num_workers 4 \ 33 | --dataset_num_proc 4 34 | -------------------------------------------------------------------------------- /examples/train/multimodal/rlhf/kto.sh: -------------------------------------------------------------------------------- 1 | # Due to the absence of a multi-modal open-source dataset for kto, 2 | # we 
will use a pure text kto dataset as an example here. 3 | nproc_per_node=2 4 | 5 | CUDA_VISIBLE_DEVICES=0,1 \ 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | MAX_PIXELS=1003520 \ 8 | swift rlhf \ 9 | --rlhf_type kto \ 10 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 11 | --dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \ 12 | --train_type lora \ 13 | --torch_dtype bfloat16 \ 14 | --num_train_epochs 1 \ 15 | --per_device_train_batch_size 1 \ 16 | --per_device_eval_batch_size 1 \ 17 | --learning_rate 1e-4 \ 18 | --lora_rank 8 \ 19 | --lora_alpha 32 \ 20 | --target_modules all-linear \ 21 | --freeze_vit true \ 22 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 23 | --eval_steps 100 \ 24 | --save_steps 100 \ 25 | --save_total_limit 2 \ 26 | --deepspeed zero2 \ 27 | --logging_steps 5 \ 28 | --max_length 4096 \ 29 | --output_dir output \ 30 | --warmup_ratio 0.05 \ 31 | --dataloader_num_workers 4 \ 32 | --dataset_num_proc 4 33 | -------------------------------------------------------------------------------- /examples/train/multimodal/video.sh: -------------------------------------------------------------------------------- 1 | # 4*80GB 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `VIDEO_MAX_PIXELS` parameter. 
3 | nproc_per_node=4 4 | 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | VIDEO_MAX_PIXELS=50176 \ 8 | FPS_MAX_FRAMES=12 \ 9 | swift sft \ 10 | --model Qwen/QVQ-72B-Preview \ 11 | --dataset swift/VideoChatGPT:all \ 12 | --train_type lora \ 13 | --torch_dtype bfloat16 \ 14 | --num_train_epochs 1 \ 15 | --per_device_train_batch_size 1 \ 16 | --per_device_eval_batch_size 1 \ 17 | --learning_rate 1e-4 \ 18 | --lora_rank 8 \ 19 | --lora_alpha 32 \ 20 | --target_modules all-linear \ 21 | --freeze_vit true \ 22 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 23 | --eval_steps 50 \ 24 | --save_steps 50 \ 25 | --save_total_limit 2 \ 26 | --logging_steps 5 \ 27 | --max_length 2048 \ 28 | --output_dir output \ 29 | --warmup_ratio 0.05 \ 30 | --dataloader_num_workers 4 \ 31 | --deepspeed zero3 32 | -------------------------------------------------------------------------------- /examples/train/packing/llm.sh: -------------------------------------------------------------------------------- 1 | # 22GB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --packing true \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'swift/self-cognition#500' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 3 \ 12 | --attn_impl flash_attn \ 13 | --per_device_train_batch_size 1 \ 14 | --per_device_eval_batch_size 1 \ 15 | --learning_rate 1e-4 \ 16 | --lora_rank 8 \ 17 | --lora_alpha 32 \ 18 | --target_modules all-linear \ 19 | --gradient_accumulation_steps 4 \ 20 | --eval_steps 50 \ 21 | --save_steps 50 \ 22 | --save_total_limit 2 \ 23 | --logging_steps 5 \ 24 | --max_length 2048 \ 25 | --output_dir output \ 26 | --system 'You are a helpful assistant.' 
\ 27 | --warmup_ratio 0.05 \ 28 | --dataloader_num_workers 4 \ 29 | --dataset_num_proc 4 \ 30 | --model_author swift \ 31 | --model_name swift-robot 32 | -------------------------------------------------------------------------------- /examples/train/packing/streaming.sh: -------------------------------------------------------------------------------- 1 | # 4 * 36GB 2 | # A demo using the Hugging Face dataset 3 | # The first model weights will be saved around step 70. 4 | NPROC_PER_NODE=4 \ 5 | MAX_PIXELS=1003520 \ 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 7 | HF_ENDPOINT=https://hf-mirror.com \ 8 | swift sft \ 9 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 10 | --train_type lora \ 11 | --dataset 'HF::linxy/LaTeX_OCR:full#20000' \ 12 | --torch_dtype bfloat16 \ 13 | --attn_impl flash_attn \ 14 | --streaming true \ 15 | --shuffle_buffer_size 1000 \ 16 | --packing true \ 17 | --save_strategy epoch \ 18 | --max_steps 1000 \ 19 | --max_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --learning_rate 1e-4 \ 23 | --lora_rank 8 \ 24 | --lora_alpha 32 \ 25 | --target_modules all-linear \ 26 | --gradient_accumulation_steps 1 \ 27 | --save_total_limit 2 \ 28 | --logging_steps 5 \ 29 | --max_length 8192 \ 30 | --output_dir output \ 31 | --warmup_ratio 0.05 \ 32 | --dataloader_num_workers 1 \ 33 | --dataset_num_proc 8 \ 34 | --deepspeed zero2 35 | -------------------------------------------------------------------------------- /examples/train/padding_free/sft.sh: -------------------------------------------------------------------------------- 1 | # Supported multimodal models reference: 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/packing/qwen2_5_vl.sh 3 | # without padding_free: 4 * 60GiB, 26h 4 | # padding_free: 4 * 44GiB, 13h 5 | NPROC_PER_NODE=4 \ 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 7 | swift sft \ 8 | --model Qwen/Qwen2.5-7B \ 9 | --train_type full \ 10 | --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \ 
11 | --torch_dtype bfloat16 \ 12 | --per_device_train_batch_size 8 \ 13 | --per_device_eval_batch_size 8 \ 14 | --learning_rate 1e-5 \ 15 | --gradient_accumulation_steps 1 \ 16 | --eval_steps 200 \ 17 | --save_steps 200 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --warmup_ratio 0.05 \ 21 | --dataloader_num_workers 8 \ 22 | --dataset_num_proc 8 \ 23 | --save_total_limit 2 \ 24 | --save_only_model true \ 25 | --output_dir output/Qwen2.5-7B \ 26 | --deepspeed zero3 \ 27 | --use_liger_kernel true \ 28 | --attn_impl flash_attn \ 29 | --padding_free true 30 | -------------------------------------------------------------------------------- /examples/train/plugins/loss_scale.sh: -------------------------------------------------------------------------------- 1 | # loss_scale all to train all tokens 2 | # use loss_type loss_scale 3 | # This is just an example 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift sft \ 6 | --model Qwen/Qwen2.5-7B-Instruct \ 7 | --train_type lora \ 8 | --dataset 'swift/self-cognition#1000' \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --learning_rate 1e-4 \ 12 | --lora_rank 8 \ 13 | --lora_alpha 32 \ 14 | --gradient_accumulation_steps 16 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --model_author swift \ 20 | --model_name swift-robot \ 21 | --loss_scale all \ 22 | --loss_type loss_scale 23 | -------------------------------------------------------------------------------- /examples/train/plugins/tuner_phi4_mm.sh: -------------------------------------------------------------------------------- 1 | # `--train_type dummy` 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model LLM-Research/Phi-4-multimodal-instruct \ 5 | --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \ 6 | --train_type dummy \ 7 | --torch_dtype bfloat16 \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 1 \ 10 | --per_device_eval_batch_size 1 \ 11 | --learning_rate 1e-4 
\ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 200 \ 14 | --save_steps 200 \ 15 | --save_total_limit 2 \ 16 | --logging_steps 5 \ 17 | --max_length 2048 \ 18 | --output_dir output \ 19 | --warmup_ratio 0.05 \ 20 | --dataloader_num_workers 4 21 | -------------------------------------------------------------------------------- /examples/train/predict_with_generate/train.sh: -------------------------------------------------------------------------------- 1 | # 20GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | MAX_PIXELS=1003520 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-VL-7B-Instruct \ 6 | --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \ 7 | --train_type lora \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 2 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 \ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --freeze_vit true \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 \ 26 | --predict_with_generate true \ 27 | --metric_for_best_model rouge-l \ 28 | --greater_is_better true 29 | -------------------------------------------------------------------------------- /examples/train/pretrain/train.sh: -------------------------------------------------------------------------------- 1 | # If not using flash_attn, or transformers<4.44, 2 | # or encountering an abnormally large loss (i.e., the model does not support packing), 3 | # please remove `--packing true`. 
4 | nproc_per_node=4 5 | 6 | NPROC_PER_NODE=$nproc_per_node \ 7 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 8 | swift pt \ 9 | --model Qwen/Qwen2.5-7B \ 10 | --train_type full \ 11 | --dataset swift/chinese-c4 \ 12 | --torch_dtype bfloat16 \ 13 | --streaming true \ 14 | --per_device_train_batch_size 1 \ 15 | --per_device_eval_batch_size 1 \ 16 | --learning_rate 1e-5 \ 17 | --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \ 18 | --packing true \ 19 | --eval_steps 500 \ 20 | --save_steps 500 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --deepspeed zero3 \ 24 | --max_length 8192 \ 25 | --max_steps 10000 \ 26 | --warmup_ratio 0.05 \ 27 | --dataloader_num_workers 4 \ 28 | --dataset_num_proc 8 \ 29 | --save_only_model true \ 30 | --output_dir output/Qwen2.5-7B \ 31 | --attn_impl flash_attn 32 | -------------------------------------------------------------------------------- /examples/train/qlora/awq.sh: -------------------------------------------------------------------------------- 1 | # 10GB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct-AWQ \ 5 | --train_type lora \ 6 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 7 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 8 | 'swift/self-cognition#500' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --system 'You are a helpful assistant.' 
\ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --model_author swift \ 28 | --model_name swift-robot 29 | -------------------------------------------------------------------------------- /examples/train/qlora/bnb.sh: -------------------------------------------------------------------------------- 1 | # 10GB 2 | # pip install bitsandbytes 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type lora \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'swift/self-cognition#500' \ 10 | --torch_dtype bfloat16 \ 11 | --bnb_4bit_compute_dtype bfloat16 \ 12 | --bnb_4bit_quant_type nf4 \ 13 | --bnb_4bit_use_double_quant true \ 14 | --quant_method bnb \ 15 | --quant_bits 4 \ 16 | --num_train_epochs 1 \ 17 | --per_device_train_batch_size 1 \ 18 | --per_device_eval_batch_size 1 \ 19 | --learning_rate 1e-4 \ 20 | --lora_rank 8 \ 21 | --lora_alpha 32 \ 22 | --target_modules all-linear \ 23 | --gradient_accumulation_steps 16 \ 24 | --eval_steps 50 \ 25 | --save_steps 50 \ 26 | --save_total_limit 2 \ 27 | --logging_steps 5 \ 28 | --max_length 2048 \ 29 | --output_dir output \ 30 | --system 'You are a helpful assistant.' 
\ 31 | --warmup_ratio 0.05 \ 32 | --dataloader_num_workers 4 \ 33 | --model_author swift \ 34 | --model_name swift-robot 35 | -------------------------------------------------------------------------------- /examples/train/qlora/gptq.sh: -------------------------------------------------------------------------------- 1 | # 2 * 30GiB 2 | CUDA_VISIBLE_DEVICES=0,1 \ 3 | MAX_PIXELS=1003520 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-VL-72B-Instruct-GPTQ-Int4 \ 6 | --dataset 'modelscope/coco_2014_caption:validation#20000' \ 7 | --train_type lora \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 \ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --freeze_vit true \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 26 | -------------------------------------------------------------------------------- /examples/train/qlora/hqq.sh: -------------------------------------------------------------------------------- 1 | # 10GB 2 | # pip install hqq 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type lora \ 7 | --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 8 | 'AI-ModelScope/alpaca-gpt4-data-en#500' \ 9 | 'swift/self-cognition#500' \ 10 | --torch_dtype bfloat16 \ 11 | --quant_method hqq \ 12 | --quant_bits 4 \ 13 | --num_train_epochs 1 \ 14 | --per_device_train_batch_size 1 \ 15 | --per_device_eval_batch_size 1 \ 16 | --learning_rate 1e-4 \ 17 | --lora_rank 8 \ 18 | --lora_alpha 32 \ 19 | --target_modules all-linear \ 20 | --gradient_accumulation_steps 16 \ 21 | --eval_steps 50 \ 22 | --save_steps 50 \ 23 | --save_total_limit 2 \ 24 | --logging_steps 5 \ 25 | --max_length 2048 \ 26 | 
--output_dir output \ 27 | --system 'You are a helpful assistant.' \ 28 | --warmup_ratio 0.05 \ 29 | --dataloader_num_workers 4 \ 30 | --model_author swift \ 31 | --model_name swift-robot 32 | -------------------------------------------------------------------------------- /examples/train/rlhf/README.md: -------------------------------------------------------------------------------- 1 | # TIPS 2 | 3 | RLHF for multi-modal models is also supported! See the `examples/train/multimodal/rlhf` folder for details. 4 | -------------------------------------------------------------------------------- /examples/train/rlhf/cpo.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=2 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | NPROC_PER_NODE=$nproc_per_node \ 5 | swift rlhf \ 6 | --rlhf_type cpo \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --deepspeed zero2 \ 28 | --dataset_num_proc 4 29 | -------------------------------------------------------------------------------- /examples/train/rlhf/dpo/full.sh: -------------------------------------------------------------------------------- 1 | # 4 * 50GiB 2 | NPROC_PER_NODE=4 \ 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 4 | swift rlhf \ 5 | --rlhf_type dpo \ 6 | --model Qwen/Qwen2.5-7B-Instruct \ 7 | --train_type full \ 8 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \
11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-5 \ 14 | --gradient_accumulation_steps 4 \ 15 | --eval_steps 100 \ 16 | --save_steps 100 \ 17 | --save_total_limit 2 \ 18 | --logging_steps 5 \ 19 | --max_length 8192 \ 20 | --output_dir output \ 21 | --warmup_ratio 0.05 \ 22 | --save_only_model true \ 23 | --dataloader_num_workers 4 \ 24 | --dataset_num_proc 4 \ 25 | --deepspeed zero3 \ 26 | --attn_impl flash_attn 27 | -------------------------------------------------------------------------------- /examples/train/rlhf/dpo/lora.sh: -------------------------------------------------------------------------------- 1 | # 24GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift rlhf \ 4 | --rlhf_type dpo \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type lora \ 7 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 8 | --torch_dtype bfloat16 \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 1 \ 12 | --learning_rate 1e-4 \ 13 | --lora_rank 8 \ 14 | --lora_alpha 32 \ 15 | --target_modules all-linear \ 16 | --gradient_accumulation_steps 16 \ 17 | --eval_steps 100 \ 18 | --save_steps 100 \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --max_length 2048 \ 22 | --output_dir output \ 23 | --warmup_ratio 0.05 \ 24 | --dataloader_num_workers 4 \ 25 | --dataset_num_proc 4 26 | -------------------------------------------------------------------------------- /examples/train/rlhf/kto.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=2 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | NPROC_PER_NODE=$nproc_per_node \ 5 | swift rlhf \ 6 | --rlhf_type kto \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 
14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 18 | --eval_steps 100 \ 19 | --save_steps 100 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 \ 26 | --deepspeed zero2 \ 27 | --dataset_num_proc 4 28 | -------------------------------------------------------------------------------- /examples/train/rlhf/orpo.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=2 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | NPROC_PER_NODE=$nproc_per_node \ 5 | swift rlhf \ 6 | --rlhf_type orpo \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --deepspeed zero2 \ 28 | --dataset_num_proc 4 29 | -------------------------------------------------------------------------------- /examples/train/rlhf/rm.sh: -------------------------------------------------------------------------------- 1 | nproc_per_node=2 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | NPROC_PER_NODE=$nproc_per_node \ 5 | swift rlhf \ 6 | --rlhf_type rm \ 7 | --model Qwen/Qwen2.5-7B-Instruct \ 8 | --train_type lora \ 9 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | 
--per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 19 | --eval_steps 100 \ 20 | --save_steps 100 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --deepspeed zero2 \ 28 | --dataset_num_proc 4 29 | -------------------------------------------------------------------------------- /examples/train/rlhf/simpo.sh: -------------------------------------------------------------------------------- 1 | # 2*50GB 2 | nproc_per_node=2 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 \ 5 | NPROC_PER_NODE=$nproc_per_node \ 6 | swift rlhf \ 7 | --rlhf_type simpo \ 8 | --model Qwen/Qwen2.5-3B-Instruct \ 9 | --train_type full \ 10 | --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \ 11 | --torch_dtype bfloat16 \ 12 | --num_train_epochs 1 \ 13 | --per_device_train_batch_size 1 \ 14 | --per_device_eval_batch_size 1 \ 15 | --learning_rate 1e-5 \ 16 | --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ 17 | --eval_steps 100 \ 18 | --save_steps 100 \ 19 | --save_total_limit 2 \ 20 | --logging_steps 5 \ 21 | --max_length 2048 \ 22 | --output_dir output \ 23 | --warmup_ratio 0.05 \ 24 | --dataloader_num_workers 4 \ 25 | --deepspeed zero2 \ 26 | --dataset_num_proc 4 27 | -------------------------------------------------------------------------------- /examples/train/seq_cls/bert/deploy.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift deploy \ 3 | --adapters output/vx-xxx/checkpoint-xxx \ 4 | --served_model_name bert-base-chinese \ 5 | --truncation_strategy right \ 6 | --max_length 512 7 | 8 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ 9 | # "model": "bert-base-chinese", 10 | # "messages": [{"role": 
"user", "content": "包装差,容易被调包。"}] 11 | # }' 12 | -------------------------------------------------------------------------------- /examples/train/seq_cls/bert/infer.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --adapters output/vx-xxx/checkpoint-xxx \ 4 | --load_data_args true \ 5 | --max_batch_size 16 \ 6 | --truncation_strategy right \ 7 | --max_length 512 8 | -------------------------------------------------------------------------------- /examples/train/seq_cls/bert/sft.sh: -------------------------------------------------------------------------------- 1 | # If `num_labels` is provided, it will be considered a classification task, 2 | # and AutoModelForSequenceClassification will be used to load the model. 3 | # The BERT model does not require templates, so it can usually be used without registration. 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift sft \ 6 | --model AI-ModelScope/bert-base-chinese \ 7 | --train_type lora \ 8 | --dataset 'DAMO_NLP/jd:cls#2000' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 512 \ 23 | --truncation_strategy right \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --num_labels 2 \ 28 | --task_type seq_cls 29 | -------------------------------------------------------------------------------- /examples/train/seq_cls/multi_label/sft.sh: -------------------------------------------------------------------------------- 1 | # Custom dataset format reference: https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift 
sft \ 4 | --model Qwen/Qwen2.5-0.5B \ 5 | --train_type lora \ 6 | --dataset '' \ 7 | --torch_dtype bfloat16 \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 16 \ 10 | --per_device_eval_batch_size 16 \ 11 | --learning_rate 1e-4 \ 12 | --lora_rank 8 \ 13 | --lora_alpha 32 \ 14 | --target_modules all-linear \ 15 | --gradient_accumulation_steps 1 \ 16 | --eval_steps 100 \ 17 | --save_steps 100 \ 18 | --save_total_limit 2 \ 19 | --logging_steps 5 \ 20 | --max_length 2048 \ 21 | --output_dir output \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 \ 24 | --dataset_num_proc 4 \ 25 | --num_labels '' \ 26 | --task_type seq_cls \ 27 | --use_chat_template false \ 28 | --problem_type multi_label_classification 29 | -------------------------------------------------------------------------------- /examples/train/seq_cls/qwen2_5/deploy.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift deploy \ 3 | --adapters output/vx-xxx/checkpoint-xxx 4 | 5 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ 6 | # "model": "Qwen2.5-0.5B", 7 | # "messages": [{"role": "user", "content": "包装差,容易被调包。"}] 8 | # }' 9 | -------------------------------------------------------------------------------- /examples/train/seq_cls/qwen2_5/infer.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --adapters output/vx-xxx/checkpoint-xxx \ 4 | --load_data_args true \ 5 | --max_batch_size 16 6 | -------------------------------------------------------------------------------- /examples/train/seq_cls/qwen2_5/sft.sh: -------------------------------------------------------------------------------- 1 | # If `num_labels` is provided, it will be considered a classification task, 2 | # and AutoModelForSequenceClassification will be used to load the model. 
3 | # You can also specify `--model Qwen/Qwen2.5-0.5B-Instruct --use_chat_template true`. 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift sft \ 6 | --model Qwen/Qwen2.5-0.5B \ 7 | --train_type lora \ 8 | --dataset 'DAMO_NLP/jd:cls#2000' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 \ 26 | --num_labels 2 \ 27 | --task_type seq_cls \ 28 | --use_chat_template false 29 | -------------------------------------------------------------------------------- /examples/train/seq_cls/qwen2_vl/infer.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | MAX_PIXELS=1003520 \ 3 | swift infer \ 4 | --adapters output/vx-xxx/checkpoint-xxx \ 5 | --load_data_args true 6 | -------------------------------------------------------------------------------- /examples/train/seq_cls/qwen2_vl/sft.sh: -------------------------------------------------------------------------------- 1 | # If `num_labels` is provided, it will be considered a classification task. 2 | # You can also specify `--model Qwen/Qwen2-VL-2B-Instruct --use_chat_template true`.
3 | CUDA_VISIBLE_DEVICES=0 \ 4 | MAX_PIXELS=1003520 \ 5 | swift sft \ 6 | --model Qwen/Qwen2-VL-2B \ 7 | --train_type lora \ 8 | --dataset 'tany0699/garbage265#20000' \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps 16 \ 18 | --eval_steps 50 \ 19 | --save_steps 50 \ 20 | --save_total_limit 2 \ 21 | --logging_steps 5 \ 22 | --max_length 2048 \ 23 | --output_dir output \ 24 | --warmup_ratio 0.05 \ 25 | --dataloader_num_workers 4 \ 26 | --num_labels 265 \ 27 | --task_type seq_cls \ 28 | --use_chat_template false 29 | -------------------------------------------------------------------------------- /examples/train/seq_cls/regression/deploy.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift deploy \ 3 | --adapters output/vx-xxx/checkpoint-xxx 4 | 5 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ 6 | # "model": "Qwen2.5-0.5B", 7 | # "messages": [{"role": "user", "content": "Task: Based on the given two sentences, provide a similarity score between 0.0 and 1.0.\nSentence 1: The animal is eating.\nSentence 2: A woman is dancing.\nSimilarity score: "}] 8 | # }' 9 | -------------------------------------------------------------------------------- /examples/train/seq_cls/regression/infer.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift infer \ 3 | --adapters output/vx-xxx/checkpoint-xxx \ 4 | --load_data_args true \ 5 | --max_batch_size 16 6 | -------------------------------------------------------------------------------- /examples/train/seq_cls/regression/sft.sh: -------------------------------------------------------------------------------- 1 | # 2GB 2 | 
CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-0.5B \ 5 | --train_type lora \ 6 | --dataset 'sentence-transformers/stsb:reg#20000' \ 7 | --torch_dtype bfloat16 \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 16 \ 10 | --per_device_eval_batch_size 16 \ 11 | --learning_rate 1e-4 \ 12 | --lora_rank 8 \ 13 | --lora_alpha 32 \ 14 | --target_modules all-linear \ 15 | --gradient_accumulation_steps 1 \ 16 | --eval_steps 100 \ 17 | --save_steps 100 \ 18 | --save_total_limit 2 \ 19 | --logging_steps 5 \ 20 | --max_length 2048 \ 21 | --output_dir output \ 22 | --warmup_ratio 0.05 \ 23 | --dataloader_num_workers 4 \ 24 | --dataset_num_proc 4 \ 25 | --num_labels 1 \ 26 | --task_type seq_cls \ 27 | --use_chat_template false \ 28 | --problem_type regression 29 | -------------------------------------------------------------------------------- /examples/train/streaming/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model Qwen/Qwen2.5-7B-Instruct \ 4 | --train_type lora \ 5 | --dataset 'swift/self-cognition#1000' \ 6 | --streaming true \ 7 | --max_steps 1000 \ 8 | --learning_rate 1e-4 \ 9 | --lora_rank 8 \ 10 | --lora_alpha 32 \ 11 | --gradient_accumulation_steps 16 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --save_total_limit 2 \ 15 | --logging_steps 5 \ 16 | --model_author swift \ 17 | --model_name swift-robot 18 | -------------------------------------------------------------------------------- /examples/train/think_model/deepseek_r1.sh: -------------------------------------------------------------------------------- 1 | # 18GB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model deepseek-ai/DeepSeek-R1-0528-Qwen3-8B \ 5 | --train_type lora \ 6 | --dataset 'swift/DeepSeek-R1-Qwen3-8B-Distill#1800' \ 7 | 'swift/self-cognition:empty_think#600' \ 8 | --loss_scale ignore_empty_think \ 9 | --torch_dtype bfloat16 \ 10 | --num_train_epochs 1 \ 
11 | --per_device_train_batch_size 1 \ 12 | --per_device_eval_batch_size 1 \ 13 | --learning_rate 1e-4 \ 14 | --lora_rank 8 \ 15 | --lora_alpha 32 \ 16 | --target_modules all-linear \ 17 | --gradient_accumulation_steps 16 \ 18 | --load_from_cache_file false \ 19 | --eval_steps 50 \ 20 | --save_steps 50 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --use_liger_kernel true \ 28 | --model_author swift \ 29 | --model_name swift-robot 30 | -------------------------------------------------------------------------------- /examples/train/think_model/qwen3_demo1.sh: -------------------------------------------------------------------------------- 1 | # use `--loss_scale ignore_empty_think` 2 | # Avoid losing the think capability by ignoring the loss of empty `<think>\n\n</think>\n\n` 3 | # This method is also applicable to the DeepSeek-R1 series of models. 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift sft \ 6 | --model Qwen/Qwen3-8B \ 7 | --train_type lora \ 8 | --dataset 'swift/Qwen3-SFT-Mixin#2000' \ 9 | 'swift/self-cognition:empty_think#600' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 50 \ 20 | --save_steps 50 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --use_liger_kernel true \ 28 | --load_from_cache_file false \ 29 | --loss_scale ignore_empty_think \ 30 | --model_author swift \ 31 | --model_name swift-robot 32 | -------------------------------------------------------------------------------- /examples/train/think_model/qwen3_demo2.sh: 
-------------------------------------------------------------------------------- 1 | # use `swift/self-cognition:qwen3` 2 | # Avoid losing the thinking capability by appending `/no_think` to the dataset query. 3 | # https://github.com/modelscope/ms-swift/blob/77985c2ccdac8ed4037174ee222e79d1f1d5059d/swift/llm/dataset/dataset/llm.py#L835 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | swift sft \ 6 | --model Qwen/Qwen3-8B \ 7 | --train_type lora \ 8 | --dataset 'swift/Qwen3-SFT-Mixin#2000' \ 9 | 'swift/self-cognition:qwen3#600' \ 10 | --torch_dtype bfloat16 \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --learning_rate 1e-4 \ 15 | --lora_rank 8 \ 16 | --lora_alpha 32 \ 17 | --target_modules all-linear \ 18 | --gradient_accumulation_steps 16 \ 19 | --eval_steps 50 \ 20 | --save_steps 50 \ 21 | --save_total_limit 2 \ 22 | --logging_steps 5 \ 23 | --max_length 2048 \ 24 | --output_dir output \ 25 | --warmup_ratio 0.05 \ 26 | --dataloader_num_workers 4 \ 27 | --use_liger_kernel true \ 28 | --load_from_cache_file false \ 29 | --model_author swift \ 30 | --model_name swift-robot 31 | -------------------------------------------------------------------------------- /examples/train/tuners/adalora/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type adalora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --gradient_accumulation_steps 16 \ 11 | --eval_steps 100 \ 12 | --save_steps 100 \ 13 | --save_total_limit 2 \ 14 | --logging_steps 5 \ 15 | --model_author swift \ 16 | --model_name swift-robot 17 | -------------------------------------------------------------------------------- /examples/train/tuners/adapter/train.sh: 
-------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type adapter \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --gradient_accumulation_steps 16 \ 11 | --eval_steps 100 \ 12 | --save_steps 100 \ 13 | --save_total_limit 2 \ 14 | --logging_steps 5 \ 15 | --model_author swift \ 16 | --model_name swift-robot 17 | -------------------------------------------------------------------------------- /examples/train/tuners/boft/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type boft \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --gradient_accumulation_steps 16 \ 11 | --eval_steps 100 \ 12 | --save_steps 100 \ 13 | --save_total_limit 2 \ 14 | --logging_steps 5 \ 15 | --model_author swift \ 16 | --model_name swift-robot 17 | -------------------------------------------------------------------------------- /examples/train/tuners/bone/train.sh: -------------------------------------------------------------------------------- 1 | # 17.3GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type bone \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --gradient_accumulation_steps 16 \ 11 | --eval_steps 100 \ 12 | --save_steps 100 \ 13 | --save_total_limit 2 \ 14 | --logging_steps 5 \ 15 | --model_author swift \ 16 | --model_name swift-robot 17 | -------------------------------------------------------------------------------- /examples/train/tuners/dora/train.sh: 
-------------------------------------------------------------------------------- 1 | # 17.2GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --use_dora true \ 7 | --dataset 'swift/self-cognition#1000' \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 1 \ 10 | --learning_rate 1e-4 \ 11 | --lora_rank 8 \ 12 | --lora_alpha 32 \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/tuners/galore/train_galore.sh: -------------------------------------------------------------------------------- 1 | # 38GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type full \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-5 \ 10 | --gradient_accumulation_steps 16 \ 11 | --eval_steps 100 \ 12 | --save_steps 100 \ 13 | --save_total_limit 2 \ 14 | --logging_steps 5 \ 15 | --model_author swift \ 16 | --model_name swift-robot \ 17 | --use_galore true \ 18 | --galore_optim_per_parameter true 19 | -------------------------------------------------------------------------------- /examples/train/tuners/galore/train_qgalore.sh: -------------------------------------------------------------------------------- 1 | # 35GiB 2 | # pip install bitsandbytes==0.40.0 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | swift sft \ 5 | --model Qwen/Qwen2.5-7B-Instruct \ 6 | --train_type full \ 7 | --torch_dtype bfloat16 \ 8 | --dataset 'lvjianjin/AdvertiseGen#1000' \ 9 | --num_train_epochs 1 \ 10 | --per_device_train_batch_size 1 \ 11 | --learning_rate 1e-5 \ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 100 \ 14 | --save_steps 100 \ 15 | --save_total_limit 2 \ 
16 | --logging_steps 5 \ 17 | --model_author swift \ 18 | --model_name swift-robot \ 19 | --use_galore true \ 20 | --galore_quantization true 21 | -------------------------------------------------------------------------------- /examples/train/tuners/lisa/train.sh: -------------------------------------------------------------------------------- 1 | # 29GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type full \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --lisa_activated_layers 2 \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 1 \ 10 | --learning_rate 1e-5 \ 11 | --gradient_accumulation_steps 16 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --save_total_limit 2 \ 15 | --logging_steps 5 \ 16 | --model_author swift \ 17 | --model_name swift-robot 18 | -------------------------------------------------------------------------------- /examples/train/tuners/llamapro/train.sh: -------------------------------------------------------------------------------- 1 | # 25.4GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type llamapro \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --llamapro_num_new_blocks 4 \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 1 \ 10 | --learning_rate 1e-4 \ 11 | --gradient_accumulation_steps 16 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --save_total_limit 2 \ 15 | --logging_steps 5 \ 16 | --model_author swift \ 17 | --model_name swift-robot 18 | -------------------------------------------------------------------------------- /examples/train/tuners/longlora/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model LLM-Research/Meta-Llama-3.1-8B-Instruct \ 4 | --train_type longlora \ 5 | --dataset 'AI-ModelScope/LongAlpaca-12k#1000' \ 6 | --num_train_epochs 1 \ 7 | --learning_rate 1e-4 \ 8 | --attn_impl 
flash_attn \ 9 | --gradient_accumulation_steps 16 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --max_length 10000 \ 15 | --save_total_limit 2 \ 16 | --logging_steps 5 17 | -------------------------------------------------------------------------------- /examples/train/tuners/lora-ga/train.sh: -------------------------------------------------------------------------------- 1 | # Train 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2-1.5B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --init_weights lora-ga \ 13 | --lora_ga_batch_size 2 \ 14 | --lora_ga_iters 2 \ 15 | --lora_ga_max_length 1024 \ 16 | --lora_ga_direction ArB2r \ 17 | --lora_ga_scale stable \ 18 | --lora_ga_stable_gamma 16 \ 19 | --gradient_accumulation_steps 16 \ 20 | --eval_steps 100 \ 21 | --save_steps 100 \ 22 | --save_total_limit 2 \ 23 | --logging_steps 5 \ 24 | --model_author swift \ 25 | --model_name swift-robot 26 | 27 | # Infer 28 | # swift infer \ 29 | # --model Qwen/Qwen2-1.5B-Instruct \ 30 | # --ckpt_dir ./output/Qwen2-1.5B-Instruct/v0-20241214-191235/checkpoint-62/converted/default \ 31 | # --infer_backend pt \ 32 | # --stream true \ 33 | # --max_new_tokens 2048 34 | -------------------------------------------------------------------------------- /examples/train/tuners/lora/train.sh: -------------------------------------------------------------------------------- 1 | # 17.2GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --gradient_accumulation_steps 16 \ 13 | --eval_steps 100 \ 14 | --save_steps 100 \ 
15 | --save_total_limit 2 \ 16 | --logging_steps 5 \ 17 | --model_author swift \ 18 | --model_name swift-robot 19 | -------------------------------------------------------------------------------- /examples/train/tuners/neftune/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --neftune_noise_alpha 15 \ 10 | --learning_rate 1e-4 \ 11 | --lora_rank 8 \ 12 | --lora_alpha 32 \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/tuners/olora/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --init_lora_weights olora \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/tuners/pissa/train.sh: -------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --train_type lora \ 6 | --dataset 'swift/self-cognition#1000' \ 7 | --num_train_epochs 1 \ 8 | 
--per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --lora_rank 8 \ 11 | --lora_alpha 32 \ 12 | --init_lora_weights pissa \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /examples/train/tuners/qlora/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model Qwen/Qwen2.5-7B-Instruct \ 4 | --train_type lora \ 5 | --dataset 'swift/self-cognition#1000' \ 6 | --num_train_epochs 1 \ 7 | --per_device_train_batch_size 1 \ 8 | --learning_rate 1e-4 \ 9 | --lora_rank 8 \ 10 | --lora_alpha 32 \ 11 | --gradient_accumulation_steps 16 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --save_total_limit 2 \ 15 | --logging_steps 5 \ 16 | --model_author swift \ 17 | --model_name swift-robot \ 18 | --quant_bits 4 \ 19 | --quant_method bnb 20 | -------------------------------------------------------------------------------- /examples/train/tuners/reft/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | swift sft \ 3 | --model Qwen/Qwen2.5-7B-Instruct \ 4 | --train_type reft \ 5 | --dataset 'swift/self-cognition#1000' \ 6 | --reft_intervention_type 'LoreftIntervention' \ 7 | --num_train_epochs 1 \ 8 | --per_device_train_batch_size 1 \ 9 | --learning_rate 1e-4 \ 10 | --gradient_checkpointing false \ 11 | --gradient_accumulation_steps 16 \ 12 | --eval_steps 100 \ 13 | --save_steps 100 \ 14 | --save_total_limit 2 \ 15 | --logging_steps 5 \ 16 | --model_author swift \ 17 | --model_name swift-robot 18 | -------------------------------------------------------------------------------- /examples/train/tuners/unsloth/train.sh: 
-------------------------------------------------------------------------------- 1 | # 17GiB 2 | CUDA_VISIBLE_DEVICES=0 \ 3 | swift sft \ 4 | --model Qwen/Qwen2.5-7B-Instruct \ 5 | --tuner_backend unsloth \ 6 | --train_type lora \ 7 | --dataset 'swift/self-cognition#1000' \ 8 | --num_train_epochs 1 \ 9 | --per_device_train_batch_size 1 \ 10 | --learning_rate 1e-4 \ 11 | --lora_rank 8 \ 12 | --lora_alpha 32 \ 13 | --gradient_accumulation_steps 16 \ 14 | --eval_steps 100 \ 15 | --save_steps 100 \ 16 | --save_total_limit 2 \ 17 | --logging_steps 5 \ 18 | --model_author swift \ 19 | --model_name swift-robot 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/framework.txt 2 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils>=0.16.0 2 | myst_parser 3 | recommonmark 4 | sphinx>=5.3.0 5 | sphinx-book-theme 6 | sphinx-copybutton 7 | sphinx-rtd-theme 8 | sphinx_markdown_tables 9 | -------------------------------------------------------------------------------- /requirements/eval.txt: -------------------------------------------------------------------------------- 1 | evalscope[opencompass] 2 | evalscope[vlmeval] 3 | -------------------------------------------------------------------------------- /requirements/framework.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | addict 3 | aiohttp 4 | attrdict 5 | binpacking 6 | charset_normalizer 7 | cpm_kernels 8 | dacite 9 | datasets>=3.0,<3.4 10 | einops 11 | fastapi 12 | gradio>=3.40.0 13 | importlib_metadata 14 | jieba 15 | matplotlib 16 | modelscope>=1.23 17 | nltk 18 | numpy<2.0 19 | openai 20 | oss2 21 | pandas 22 | peft>=0.11,<0.16 23 | pillow 24 | requests 25 | rouge 26 | 
safetensors 27 | scipy 28 | sentencepiece 29 | simplejson>=3.3.0 30 | sortedcontainers>=1.5.9 31 | tensorboard 32 | tiktoken 33 | tqdm 34 | transformers>=4.33,<4.53 35 | transformers_stream_generator 36 | trl>=0.15,<0.20 37 | uvicorn 38 | zstandard 39 | -------------------------------------------------------------------------------- /requirements/install_all.sh: -------------------------------------------------------------------------------- 1 | # please use python=3.10, cuda12.* 2 | # sh requirements/install_all.sh 3 | pip install "vllm>=0.5.1,<0.9" -U 4 | pip install "lmdeploy>=0.5" -U --no-deps 5 | pip install autoawq -U --no-deps 6 | pip install auto_gptq optimum bitsandbytes -U 7 | pip install git+https://github.com/modelscope/ms-swift.git 8 | pip install timm -U 9 | pip install deepspeed -U 10 | pip install qwen_vl_utils qwen_omni_utils decord librosa icecream soundfile -U 11 | pip install liger_kernel nvitop pre-commit -U 12 | # flash-attn: https://github.com/Dao-AILab/flash-attention/releases 13 | -------------------------------------------------------------------------------- /requirements/seq_parallel.txt: -------------------------------------------------------------------------------- 1 | xtuner 2 | -------------------------------------------------------------------------------- /requirements/swanlab.txt: -------------------------------------------------------------------------------- 1 | swanlab 2 | -------------------------------------------------------------------------------- /requirements/tests.txt: -------------------------------------------------------------------------------- 1 | expecttest 2 | flake8 3 | isort>=4.3.21 4 | modelscope 5 | pre-commit 6 | yapf==0.30.0 # use a fixed version to ensure consistent auto-styling 7 | -------------------------------------------------------------------------------- /scripts/utils/plot_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from swift.utils 
import plot_images 4 | 5 | ckpt_dir = 'output/xxx/vx-xxx' 6 | if __name__ == '__main__': 7 | images_dir = os.path.join(ckpt_dir, 'images') 8 | tb_dir = os.path.join(ckpt_dir, 'runs') 9 | plot_images(images_dir, tb_dir, ['train/loss'], 0.9) 10 | -------------------------------------------------------------------------------- /scripts/utils/run_template.py: -------------------------------------------------------------------------------- 1 | from swift.llm import TemplateType 2 | 3 | if __name__ == '__main__': 4 | template_name_list = TemplateType.get_template_name_list() 5 | tn_gen = ', '.join([tn for tn in template_name_list if 'generation' in tn]) 6 | tn_chat = ', '.join([tn for tn in template_name_list if 'generation' not in tn]) 7 | print(f'Text Generation: {tn_gen}') 8 | print(f'Chat: {tn_chat}') 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 120 3 | multi_line_output = 0 4 | known_standard_library = setuptools 5 | known_first_party = swift 6 | known_third_party = json,yaml 7 | no_lines_before = STDLIB,LOCALFOLDER 8 | default_section = THIRDPARTY 9 | 10 | [yapf] 11 | BASED_ON_STYLE = pep8 12 | COLUMN_LIMIT = 120 13 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 14 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 15 | SPLIT_BEFORE_ARITHMETIC_OPERATOR = true 16 | 17 | [codespell] 18 | skip = *.ipynb 19 | quiet-level = 3 20 | ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids 21 | 22 | [flake8] 23 | max-line-length = 120 24 | select = B,C,E,F,P,T4,W,B9 25 | ignore = F401,F403,F405,F821,W503,E251,W504,E126 26 | exclude = docs/src,*.pyi,.git,peft.py 27 | 28 | [darglint] 29 | ignore=DAR101 30 | 31 | [easy_install] 32 | index-url=https://pypi.tuna.tsinghua.edu.cn/simple 33 | -------------------------------------------------------------------------------- /swift/cli/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/cli/__init__.py -------------------------------------------------------------------------------- /swift/cli/_megatron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/cli/_megatron/__init__.py -------------------------------------------------------------------------------- /swift/cli/_megatron/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from typing import Dict 3 | 4 | from swift.utils import get_logger 5 | from ..main import cli_main as swift_cli_main 6 | 7 | logger = get_logger() 8 | 9 | ROUTE_MAPPING: Dict[str, str] = { 10 | 'pt': 'swift.cli._megatron.pt', 11 | 'sft': 'swift.cli._megatron.sft', 12 | } 13 | 14 | 15 | def cli_main(): 16 | return swift_cli_main(ROUTE_MAPPING) 17 | 18 | 19 | if __name__ == '__main__': 20 | cli_main() 21 | -------------------------------------------------------------------------------- /swift/cli/_megatron/pt.py: -------------------------------------------------------------------------------- 1 | from swift.megatron import megatron_pt_main 2 | 3 | if __name__ == '__main__': 4 | megatron_pt_main() 5 | -------------------------------------------------------------------------------- /swift/cli/_megatron/sft.py: -------------------------------------------------------------------------------- 1 | from swift.megatron import megatron_sft_main 2 | 3 | if __name__ == '__main__': 4 | megatron_sft_main() 5 | -------------------------------------------------------------------------------- /swift/cli/app.py: -------------------------------------------------------------------------------- 1 | from swift.llm import app_main 2 
| 3 | if __name__ == '__main__': 4 | app_main() 5 | -------------------------------------------------------------------------------- /swift/cli/deploy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import deploy_main 3 | 4 | if __name__ == '__main__': 5 | deploy_main() 6 | -------------------------------------------------------------------------------- /swift/cli/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import eval_main 3 | 4 | if __name__ == '__main__': 5 | eval_main() 6 | -------------------------------------------------------------------------------- /swift/cli/export.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import export_main 3 | 4 | if __name__ == '__main__': 5 | export_main() 6 | -------------------------------------------------------------------------------- /swift/cli/infer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import infer_main 3 | 4 | if __name__ == '__main__': 5 | infer_main() 6 | -------------------------------------------------------------------------------- /swift/cli/merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
2 | from swift.llm import ExportArguments, SwiftPipeline, merge_lora 3 | 4 | 5 | class SwiftMergeLoRA(SwiftPipeline): 6 | args_class = ExportArguments 7 | args: args_class 8 | 9 | def run(self): 10 | merge_lora(self.args) 11 | 12 | 13 | if __name__ == '__main__': 14 | SwiftMergeLoRA().main() 15 | -------------------------------------------------------------------------------- /swift/cli/pt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import pt_main 3 | 4 | if __name__ == '__main__': 5 | pt_main() 6 | -------------------------------------------------------------------------------- /swift/cli/rlhf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import rlhf_main 3 | 4 | if __name__ == '__main__': 5 | rlhf_main() 6 | -------------------------------------------------------------------------------- /swift/cli/rollout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm import rollout_main 3 | 4 | if __name__ == '__main__': 5 | rollout_main() 6 | -------------------------------------------------------------------------------- /swift/cli/sample.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.llm.sampling import sampling_main 3 | 4 | if __name__ == '__main__': 5 | sampling_main() 6 | -------------------------------------------------------------------------------- /swift/cli/sft.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
2 | import os 3 | 4 | from swift.llm import sft_main 5 | 6 | if __name__ == '__main__': 7 | sft_main() 8 | -------------------------------------------------------------------------------- /swift/cli/web_ui.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from swift.ui import webui_main 3 | 4 | if __name__ == '__main__': 5 | webui_main() 6 | -------------------------------------------------------------------------------- /swift/hub/__init__.py: -------------------------------------------------------------------------------- 1 | from .hub import HFHub, MSHub, get_hub 2 | -------------------------------------------------------------------------------- /swift/hub/constant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from modelscope.hub import constants 3 | 4 | constants.API_HTTP_CLIENT_TIMEOUT = 5 5 | constants.API_FILE_DOWNLOAD_TIMEOUT = 300 6 | constants.API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 16 7 | -------------------------------------------------------------------------------- /swift/llm/app/__init__.py: -------------------------------------------------------------------------------- 1 | from .app import SwiftApp, app_main 2 | -------------------------------------------------------------------------------- /swift/llm/app/locale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
2 | locale_mapping = { 3 | 'modify_system': { 4 | 'en': '🛠️ Set system and clear history', 5 | 'zh': '🛠️ 设置system并清空历史' 6 | }, 7 | 'clear_history': { 8 | 'en': '🧹 Clear history', 9 | 'zh': '🧹 清空历史' 10 | }, 11 | 'submit': { 12 | 'en': '🚀 Send', 13 | 'zh': '🚀 发送' 14 | }, 15 | 'regenerate': { 16 | 'en': '🤔️ Regenerate', 17 | 'zh': '🤔️ 重试' 18 | }, 19 | 'upload': { 20 | 'en': '📁 Upload', 21 | 'zh': '📁 上传' 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /swift/llm/argument/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .app_args import AppArguments 3 | from .base_args import BaseArguments 4 | from .deploy_args import DeployArguments 5 | from .eval_args import EvalArguments 6 | from .export_args import ExportArguments 7 | from .infer_args import InferArguments 8 | from .rlhf_args import RLHFArguments 9 | from .sampling_args import SamplingArguments 10 | from .train_args import TrainArguments 11 | from .tuner_args import TunerArguments 12 | from .webui_args import WebUIArguments 13 | -------------------------------------------------------------------------------- /swift/llm/argument/base_args/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .base_args import BaseArguments 3 | from .utils import to_abspath 4 | -------------------------------------------------------------------------------- /swift/llm/argument/base_args/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | import os 3 | from typing import List, Union 4 | 5 | 6 | def to_abspath(path: Union[str, List[str], None], check_path_exist: bool = False) -> Union[str, List[str], None]: 7 | """Check the path for validity and convert it to an absolute path. 
8 | 9 | Args: 10 | path: The path to be checked/converted 11 | check_path_exist: Whether to check if the path exists 12 | 13 | Returns: 14 | Absolute path 15 | """ 16 | if path is None: 17 | return 18 | elif isinstance(path, str): 19 | # Remove user path prefix and convert to absolute path. 20 | path = os.path.abspath(os.path.expanduser(path)) 21 | if check_path_exist and not os.path.exists(path): 22 | raise FileNotFoundError(f"path: '{path}'") 23 | return path 24 | assert isinstance(path, list), f'path: {path}' 25 | res = [] 26 | for v in path: 27 | res.append(to_abspath(v, check_path_exist)) 28 | return res 29 | -------------------------------------------------------------------------------- /swift/llm/argument/merge_args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from dataclasses import dataclass 3 | 4 | from swift.utils import get_logger 5 | 6 | logger = get_logger() 7 | 8 | 9 | @dataclass 10 | class MergeArguments: 11 | """ 12 | MergeArguments is a dataclass that holds configuration for merging models. 13 | 14 | Args: 15 | merge_lora (bool): Flag to indicate if LoRA merging is enabled. Default is False. 16 | safe_serialization(bool): Use safetensors or not, default `True`. 17 | max_shard_size(str): The max size of single shard file. 18 | """ 19 | merge_lora: bool = False 20 | safe_serialization: bool = True 21 | max_shard_size: str = '5GB' 22 | -------------------------------------------------------------------------------- /swift/llm/argument/webui_args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class WebUIArguments: 7 | """ 8 | Args: 9 | server_name (str): The hostname or IP address to bind the web UI server to. Default is '0.0.0.0'. 10 | server_port (int): The port number to bind the web UI server to. 
Default is 7860. 11 | share (bool): A flag indicating whether to share the web UI publicly. Default is False. 12 | lang (str): The language setting for the web UI. Default is 'zh'. 13 | """ 14 | server_name: str = '0.0.0.0' 15 | server_port: int = 7860 16 | share: bool = False 17 | lang: str = 'zh' 18 | -------------------------------------------------------------------------------- /swift/llm/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | import datasets.fingerprint 3 | from datasets import Dataset as HfDataset 4 | 5 | from ..utils import get_temporary_cache_files_directory 6 | from . import dataset 7 | from .loader import DATASET_TYPE, load_dataset 8 | from .media import MediaResource 9 | from .preprocessor import (AlpacaPreprocessor, AutoPreprocessor, MessagesPreprocessor, ResponsePreprocessor, 10 | RowPreprocessor) 11 | from .register import DATASET_MAPPING, DatasetMeta, SubsetDataset, register_dataset, register_dataset_info 12 | from .utils import (EncodePreprocessor, GetLengthPreprocessor, IterablePackingDataset, LazyLLMDataset, PackingDataset, 13 | sample_dataset) 14 | 15 | datasets.fingerprint.get_temporary_cache_files_directory = get_temporary_cache_files_directory 16 | datasets.arrow_dataset.get_temporary_cache_files_directory = get_temporary_cache_files_directory 17 | register_dataset_info() 18 | -------------------------------------------------------------------------------- /swift/llm/dataset/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from . import llm, mllm 3 | -------------------------------------------------------------------------------- /swift/llm/dataset/preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
2 | from .core import (DATASET_TYPE, AlpacaPreprocessor, AutoPreprocessor, ClsPreprocessor, MessagesPreprocessor, 3 | ResponsePreprocessor, RowPreprocessor) 4 | from .extra import ClsGenerationPreprocessor, GroundingMixin, TextGenerationPreprocessor 5 | -------------------------------------------------------------------------------- /swift/llm/ds_config/zero0.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 0, 17 | "allgather_partitions": true, 18 | "allgather_bucket_size": 2e8, 19 | "overlap_comm": false, 20 | "reduce_scatter": true, 21 | "reduce_bucket_size": 2e8, 22 | "contiguous_gradients": true 23 | }, 24 | 25 | "gradient_accumulation_steps": "auto", 26 | "gradient_clipping": "auto", 27 | "steps_per_print": 2000, 28 | "train_batch_size": "auto", 29 | "train_micro_batch_size_per_gpu": "auto", 30 | "wall_clock_breakdown": false 31 | } 32 | -------------------------------------------------------------------------------- /swift/llm/ds_config/zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 1, 17 | "offload_optimizer": { 18 | "device": "none", 19 | "pin_memory": true 20 | }, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": false, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | 29 | "gradient_accumulation_steps": "auto", 30 | 
"gradient_clipping": "auto", 31 | "steps_per_print": 2000, 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto", 34 | "wall_clock_breakdown": false 35 | } 36 | -------------------------------------------------------------------------------- /swift/llm/ds_config/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 2, 17 | "offload_optimizer": { 18 | "device": "none", 19 | "pin_memory": true 20 | }, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": false, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | 29 | "gradient_accumulation_steps": "auto", 30 | "gradient_clipping": "auto", 31 | "steps_per_print": 2000, 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto", 34 | "wall_clock_breakdown": false 35 | } 36 | -------------------------------------------------------------------------------- /swift/llm/ds_config/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 2, 17 | "offload_optimizer": { 18 | "device": "cpu", 19 | "pin_memory": true 20 | }, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": false, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | 29 | "gradient_accumulation_steps": "auto", 
30 | "gradient_clipping": "auto", 31 | "steps_per_print": 2000, 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto", 34 | "wall_clock_breakdown": false 35 | } 36 | -------------------------------------------------------------------------------- /swift/llm/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .eval import SwiftEval, eval_main 3 | -------------------------------------------------------------------------------- /swift/llm/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .export import SwiftExport, export_main 3 | from .merge_lora import merge_lora 4 | from .ollama import export_to_ollama 5 | from .quant import quantize_model 6 | -------------------------------------------------------------------------------- /swift/llm/infer/infer_engine/patch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
@contextmanager
def patch_auto_tokenizer(tokenizer: PreTrainedTokenizerBase):
    """Temporarily make ``AutoTokenizer.from_pretrained`` return ``tokenizer``.

    While the context is active, every call to ``AutoTokenizer.from_pretrained``
    ignores its arguments and returns the given instance. The previous
    callable is restored on exit, even if the body raises.

    Args:
        tokenizer: The tokenizer instance to hand back to callers.
    """
    _old_from_pretrained = AutoTokenizer.from_pretrained

    @wraps(_old_from_pretrained)
    def _from_pretrained(*args, **kwargs):
        return tokenizer

    AutoTokenizer.from_pretrained = _from_pretrained
    try:
        yield
    finally:
        AutoTokenizer.from_pretrained = _old_from_pretrained


@contextmanager
def patch_auto_config(config: PretrainedConfig):
    """Temporarily make ``AutoConfig.from_pretrained`` return ``config``.

    While the context is active, every call to ``AutoConfig.from_pretrained``
    ignores its positional arguments and returns the given instance. When the
    caller passes a truthy ``return_unused_kwargs``, a ``(config, {})`` tuple
    is returned instead. The previous callable is restored on exit, even if
    the body raises.

    Args:
        config: The config instance to hand back to callers.
    """
    _old_from_pretrained = AutoConfig.from_pretrained

    @wraps(_old_from_pretrained)
    def _from_pretrained(*args, **kwargs):
        # Honour the flag's value, not just its presence: an explicit
        # ``return_unused_kwargs=False`` must yield the bare config.
        if kwargs.get('return_unused_kwargs'):
            return config, {}
        return config

    AutoConfig.from_pretrained = _from_pretrained
    try:
        yield
    finally:
        AutoConfig.from_pretrained = _old_from_pretrained
import model 3 | from .constant import LLMModelType, MLLMModelType, ModelType 4 | from .model_arch import MODEL_ARCH_MAPPING, ModelArch, ModelKeys, MultiModelKeys, get_model_arch, register_model_arch 5 | from .register import (MODEL_MAPPING, Model, ModelGroup, ModelMeta, fix_do_sample_warning, get_default_device_map, 6 | get_default_torch_dtype, get_matched_model_meta, get_model_info_meta, get_model_name, 7 | get_model_tokenizer, get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, 8 | load_by_unsloth, register_model) 9 | from .utils import HfConfigFactory, ModelInfo, get_llm_model, git_clone_github, safe_snapshot_download 10 | -------------------------------------------------------------------------------- /swift/llm/model/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, 2 | minicpm, minimax, mistral, mllm, moonshot, mplug, openbuddy, qwen, skywork, stepfun, telechat, valley, 3 | yi) 4 | -------------------------------------------------------------------------------- /swift/llm/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampling import sampling_main 2 | -------------------------------------------------------------------------------- /swift/llm/template/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from . 
import template 3 | from .base import MaxLengthError, Template 4 | from .constant import TemplateType 5 | from .grounding import draw_bbox 6 | from .register import TEMPLATE_MAPPING, get_template, get_template_meta, register_template 7 | from .template_inputs import InferRequest, TemplateInputs 8 | from .template_meta import TemplateMeta 9 | from .utils import Prompt, Word, split_str_parts_by 10 | from .vision_utils import load_file, load_image 11 | -------------------------------------------------------------------------------- /swift/llm/template/template/__init__.py: -------------------------------------------------------------------------------- 1 | from . import (deepseek, emu3, gemma, glm, idefics3, internlm, internvl, llama, llava, llm, megrez, microsoft, minicpm, 2 | minimax, mistral, molmo, moonshot, mplug, openbuddy, pixtral, qwen, stepfun, valley, yi) 3 | -------------------------------------------------------------------------------- /swift/llm/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .pt import SwiftPt, pt_main 3 | from .rlhf import SwiftRLHF, rlhf_main 4 | from .sft import SwiftSft, sft_main 5 | from .tuner import get_multimodal_target_regex 6 | -------------------------------------------------------------------------------- /swift/llm/train/pt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
class SwiftPt(SwiftSft):
    """Pre-training pipeline: ``SwiftSft`` with chat templating switched off
    and ``loss_scale`` forced to ``'all'`` before the template is prepared."""
    args_class = TrainArguments
    args: args_class

    def _prepare_template(self) -> None:
        # Override both settings first, then log them, then defer to the
        # parent implementation for the actual template preparation.
        train_args = self.args
        train_args.use_chat_template = False
        train_args.loss_scale = 'all'
        for message in ('Setting args.use_chat_template: False', "Setting args.loss_scale: 'all'"):
            logger.info(message)
        super()._prepare_template()


def pt_main(args: Union[List[str], TrainArguments, None] = None):
    """Build a ``SwiftPt`` pipeline from ``args`` and return the result of its ``main()``."""
    pipeline = SwiftPt(args)
    return pipeline.main()
def convert_gpt_hf_config(config) -> Dict[str, Any]:
    """Convert an HF config via ``convert_hf_config`` and apply GPT-family tweaks.

    * ``qwen3`` / ``qwen3_moe``: set ``qk_layernorm`` to ``True``.
    * ``qwen2_moe`` / ``qwen3_moe``: drop ``ffn_hidden_size`` if present.

    Returns:
        The adjusted mapping produced by ``convert_hf_config``.
    """
    converted = convert_hf_config(config)
    arch = converted.get('model_type')

    needs_qk_layernorm = arch in {'qwen3', 'qwen3_moe'}
    is_moe_variant = arch in {'qwen2_moe', 'qwen3_moe'}

    if needs_qk_layernorm:
        converted['qk_layernorm'] = True
    if is_moe_variant:
        converted.pop('ffn_hidden_size', None)
    return converted
@contextmanager
def patch_megatron_data_collator(data_collator):
    """Temporarily wrap ``training.build_pretraining_data_loader``.

    While the context is active, every data loader the original builder
    returns gets ``collate_fn`` replaced by ``data_collator``, unless the
    builder returned ``None`` or ``get_args().dataloader_type`` is
    ``'external'``. The original builder is restored on exit.
    """
    original_builder = training.build_pretraining_data_loader

    def build_pretraining_data_loader(*positional, **keyword):
        megatron_args = get_args()
        loader = original_builder(*positional, **keyword)
        if loader is None:
            return loader
        if megatron_args.dataloader_type == 'external':
            return loader
        loader.collate_fn = data_collator
        return loader

    training.build_pretraining_data_loader = build_pretraining_data_loader
    try:
        yield
    finally:
        training.build_pretraining_data_loader = original_builder
def patch_megatron_tokenizer(tokenizer):
    """Replace ``global_vars.build_tokenizer`` so it always returns ``tokenizer``.

    The patch is permanent: the previous builder is not saved or restored.
    """

    def build_tokenizer(args):
        return tokenizer

    global_vars.build_tokenizer = build_tokenizer


def patch_torch_dist_shard(thread_count):
    """Force ``thread_count`` on every ``TorchDistSaveShardedStrategy`` built.

    The class ``__init__`` is replaced by a wrapper that overrides the
    ``thread_count`` keyword argument before delegating to the original
    constructor. Calling this function again re-wraps the *original*
    constructor, so the most recent ``thread_count`` wins. Previously the
    wrappers were stacked and the oldest value silently won.

    Args:
        thread_count: Value passed as the ``thread_count`` keyword argument.
    """
    current_init = TorchDistSaveShardedStrategy.__init__
    # If ``__init__`` is already one of our wrappers, unwrap it first so
    # wrappers never nest.
    origin_init = getattr(current_init, '_swift_origin_init', current_init)

    def __new_init__(*args, **kwargs):
        kwargs['thread_count'] = thread_count
        return origin_init(*args, **kwargs)

    __new_init__._swift_origin_init = origin_init
    TorchDistSaveShardedStrategy.__init__ = __new_init__
"What are the required parameter names": [3.0], 19 | "What is the value of": [3.0], 20 | "What are the required parameter names for this tool": [3.0] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /swift/plugin/loss_scale/config/alpha_umi.json: -------------------------------------------------------------------------------- 1 | { 2 | "Action:": [2.0, 2.0], 3 | "Action Input:": [2.0, 2.0], 4 | "Thought:": [1.0, 1.0], 5 | "Final Answer:": [1.0, 1.0], 6 | "Observation:": [2.0, 0.0], 7 | "Next:": [2.0, 2.0] 8 | } 9 | -------------------------------------------------------------------------------- /swift/plugin/loss_scale/config/hermes.json: -------------------------------------------------------------------------------- 1 | { 2 | ".+?": [2.0] 3 | } 4 | -------------------------------------------------------------------------------- /swift/plugin/loss_scale/config/ignore_empty_think.json: -------------------------------------------------------------------------------- 1 | { 2 | "\\s*\\s*": [0.0] 3 | } 4 | -------------------------------------------------------------------------------- /swift/plugin/loss_scale/config/qwen.json: -------------------------------------------------------------------------------- 1 | { 2 | "✿FUNCTION✿:": [2.0, 2.0], 3 | "✿ARGS✿:": [2.0, 2.0], 4 | "✿RETURN✿:": [1.0, 1.0], 5 | "✿RESULT✿:": [2.0, 0.0] 6 | } 7 | -------------------------------------------------------------------------------- /swift/plugin/loss_scale/config/react.json: -------------------------------------------------------------------------------- 1 | { 2 | "Action:": [2.0, 2.0], 3 | "Action Input:": [2.0, 2.0], 4 | "Thought:": [1.0, 1.0], 5 | "Final Answer:": [1.0, 1.0], 6 | "Observation:": [2.0, 0.0] 7 | } 8 | -------------------------------------------------------------------------------- /swift/trainers/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright
(c) Alibaba, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /swift/trainers/optimizers/galore/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from swift.utils.import_utils import _LazyModule 6 | 7 | if TYPE_CHECKING: 8 | from .utils import create_optimizer_and_scheduler, GaLoreConfig 9 | from .adafactor import GaLoreAdafactor 10 | from .adamw8bit import GaLoreAdamW8bit 11 | from .adamw import GaLoreAdamW 12 | else: 13 | _import_structure = { 14 | 'utils': ['GaLoreConfig', 'create_optimizer_and_scheduler'], 15 | 'adafactor': ['GaLoreAdafactor'], 16 | 'adamw8bit': ['GaLoreAdamW8bit'], 17 | 'adamw': ['GaLoreAdamW'], 18 | } 19 | 20 | import sys 21 | 22 | sys.modules[__name__] = _LazyModule( 23 | __name__, 24 | globals()['__file__'], 25 | _import_structure, 26 | module_spec=__spec__, 27 | extra_objects={}, 28 | ) 29 | -------------------------------------------------------------------------------- /swift/trainers/rlhf_trainer/orpo_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from typing import Optional, Union 3 | 4 | import torch.nn as nn 5 | from transformers import PreTrainedModel 6 | from trl import ORPOTrainer as HFORPOTrainer 7 | 8 | from ..mixin import SwiftMixin 9 | from .rlhf_mixin import RLHFTrainerMixin 10 | 11 | del HFORPOTrainer.__init__ 12 | 13 | 14 | class ORPOTrainer(RLHFTrainerMixin, SwiftMixin, HFORPOTrainer): 15 | 16 | def __init__(self, model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, *_args, **kwargs): 17 | ref_model = kwargs.get('ref_model') 18 | assert ref_model is None, 'ORPO does not require a ref_model.' 
class SequenceParallel(abc.ABC):
    """Abstract interface that sequence-parallel back-ends must implement.

    Every method except ``sp_group`` is abstract; the base implementations
    do nothing and return ``None``.
    """

    @abc.abstractmethod
    def init_sequence_parallel(self, size):
        """Abstract. NOTE(review): presumably ``size`` is the sequence-parallel degree; confirm."""

    @abc.abstractmethod
    def prepare_model(self, model, tokenizer):
        """Abstract hook receiving the model and its tokenizer."""

    @abc.abstractmethod
    def pad_and_split_inputs(self,
                             input_ids,
                             input_embeds,
                             labels,
                             position_ids,
                             attention_mask,
                             loss_scale,
                             embed_tokens=None):
        """Abstract hook receiving the batch fields listed in the signature."""

    @abc.abstractmethod
    def reduce_outputs(self, loss, labels):
        """Abstract hook receiving the loss and the labels."""

    @property
    def sp_group(self):
        """Sequence-parallel group; the base class reports ``None``."""
        return None

    @abc.abstractmethod
    def world_size(self):
        """Abstract. Declared as a plain method here, not as a property."""

    @abc.abstractmethod
    def prepare_trainer(self, trainer):
        """Abstract hook receiving the trainer."""

    @abc.abstractmethod
    def get_dataloader(self, trainer, dataset, batch_size):
        """Abstract hook receiving the trainer, a dataset and a batch size."""
2 | -------------------------------------------------------------------------------- /swift/tuners/scetuning/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .scetuning import SCETuning, SCETuningConfig 3 | -------------------------------------------------------------------------------- /swift/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | from .app import webui_main 3 | -------------------------------------------------------------------------------- /swift/ui/llm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /swift/ui/llm_export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /swift/ui/llm_grpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/ui/llm_grpo/__init__.py -------------------------------------------------------------------------------- /swift/ui/llm_grpo/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba, Inc. and its affiliates. 
# File-name patterns, one per binary artifact suffix.
# NOTE(review): the leading '.*.' suggests these are matched as regular expressions; confirm against the caller.
BIN_EXTENSIONS = ['.*.' + suffix for suffix in (
    'bin',
    'ts',
    'pt',
    'data-00000-of-00001',
    'onnx',
    'meta',
    'pb',
    'index',
)]

# Key names and default adapter name shared across the package.
PEFT_TYPE_KEY, SWIFT_TYPE_KEY = 'peft_type', 'swift_type'
DEFAULT_ADAPTER = 'default'


class Invoke:
    """String constants: ``KEY`` plus the values defined alongside it."""
    KEY = 'invoked_by'
    THIRD_PARTY = 'third_party'
    PRETRAINED = 'from_pretrained'
    PIPELINE = 'pipeline'
    TRAINER = 'trainer'
    LOCAL_TRAINER = 'local_trainer'
    PREPROCESSOR = 'preprocessor'
    SWIFT = 'swift'
def test_llm():
    """Run the app with a text-only model."""
    from swift.llm import AppArguments, app_main
    app_args = AppArguments(model='Qwen/Qwen2.5-0.5B-Instruct')
    app_main(app_args)


def test_lora():
    """Run the app from an adapter, with the English UI and a custom studio title."""
    from swift.llm import AppArguments, app_main
    app_args = AppArguments(adapters='swift/test_lora', lang='en', studio_title='小黄')
    app_main(app_args)


def test_mllm():
    """Run the app with a vision-language model and streaming enabled."""
    from swift.llm import AppArguments, app_main
    app_args = AppArguments(model='Qwen/Qwen2-VL-7B-Instruct', stream=True)
    app_main(app_args)


def test_audio():
    """Deploy an audio model, then run the app against the deployed URL."""
    from swift.llm import AppArguments, DeployArguments, app_main, run_deploy
    server_args = DeployArguments(model='Qwen/Qwen2-Audio-7B-Instruct', infer_backend='pt', verbose=False)

    with run_deploy(server_args, return_url=True) as endpoint:
        client_args = AppArguments(model='Qwen2-Audio-7B-Instruct', base_url=endpoint, stream=True)
        app_main(client_args)


if __name__ == '__main__':
    test_mllm()
def test_local_dataset():
    """Clone the dataset repository, then stream its ``firefly`` subset from the local path."""
    # please use git clone
    from swift.llm import git_clone_github
    cloned_dir = git_clone_github('https://www.modelscope.cn/datasets/swift/swift-sft-mixture.git')
    loaded = load_dataset(datasets=[f'{cloned_dir}:firefly'], streaming=True)
    stream = loaded[0]
    print(next(iter(stream)))


def test_hub_dataset():
    """Stream the ``firefly`` subset directly by dataset id."""
    dataset_id = 'swift/swift-sft-mixture:firefly'
    loaded = load_dataset(datasets=[dataset_id], streaming=True)
    stream = loaded[0]
    print(next(iter(stream)))


if __name__ == '__main__':
    test_local_dataset()
    # test_hub_dataset()
/tests/hub/test_check_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | from modelscope import Model, check_local_model_is_latest 7 | 8 | 9 | class TestCheckModel(unittest.TestCase): 10 | 11 | def setUp(self): 12 | print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) 13 | self.tmp_dir = tempfile.TemporaryDirectory().name 14 | if not os.path.exists(self.tmp_dir): 15 | os.makedirs(self.tmp_dir) 16 | 17 | def tearDown(self): 18 | import peft 19 | shutil.rmtree(self.tmp_dir) 20 | super().tearDown() 21 | 22 | def test_check_model(self): 23 | model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base', revision='v1.0.0') 24 | self.assertFalse(check_local_model_is_latest(model.model_dir)) 25 | -------------------------------------------------------------------------------- /tests/infer/test_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | kwargs = { 8 | 'per_device_train_batch_size': 2, 9 | 'save_steps': 50, 10 | 'gradient_accumulation_steps': 4, 11 | 'num_train_epochs': 1, 12 | } 13 | 14 | 15 | def test_sft(): 16 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' 17 | from swift.llm import sft_main, TrainArguments, infer_main, InferArguments 18 | sft_main( 19 | TrainArguments(model='Qwen/Qwen2-7B-Instruct', dataset=['iic/ms_agent#2000'], loss_scale='react', **kwargs)) 20 | 21 | 22 | def test_infer(): 23 | from swift.llm import infer_main, InferArguments 24 | ckpt_dir = 'output/Qwen2-7B-Instruct/v229-20241126-133152/checkpoint-100' 25 | infer_main(InferArguments(ckpt_dir=ckpt_dir)) 26 | 27 | 28 | if __name__ == '__main__': 29 | test_sft() 30 | # test_infer() 31 | -------------------------------------------------------------------------------- /tests/infer/test_max_memory.py: 
def test_max_memory():
    """Run inference with `device_map='sequential'` and a `max_memory` mapping passed as a string."""
    infer_args = InferArguments(
        model='Qwen/Qwen2.5-7B-Instruct',
        device_map='sequential',
        max_memory='{0: "50GB", 1: "5GB"}',
    )
    infer_main(infer_args)
"system123"} 2 | {"instruction": "aaaaa", "output": "ccccc"} 3 | {"instruction": "AAAAA", "input": "BBBBB", "output": "CCCCC"} 4 | -------------------------------------------------------------------------------- /tests/llm/data/alpaca2.csv: -------------------------------------------------------------------------------- 1 | instruction,output 2 | 11111,33333 3 | aaaaa,ccccc 4 | AAAAA,CCCCC 5 | -------------------------------------------------------------------------------- /tests/llm/data/chatml.jsonl: -------------------------------------------------------------------------------- 1 | {"messages": [{"role": "system", "content": "00000"}, {"role": "user", "content": "11111"}, {"role": "assistant", "content": "22222"}]} 2 | {"messages": [{"role": "user", "content": "aaaaa"}, {"role": "assistant", "content": "bbbbb"}, {"role": "user", "content": "ccccc"}, {"role": "assistant", "content": "ddddd"}]} 3 | {"messages": [{"role": "user", "content": "AAAAA"}, {"role": "assistant", "content": "BBBBB"}, {"role": "user", "content": "CCCCC"}, {"role": "assistant", "content": "DDDDD"}]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/conversations.jsonl: -------------------------------------------------------------------------------- 1 | {"conversations": [{"from": "system", "value": "00000"}, {"from": "user", "value": "11111"}, {"from": "assistant", "value": "22222"}]} 2 | {"conversations": [{"from": "user", "value": "aaaaa"}, {"from": "assistant", "value": "bbbbb"}, {"from": "user", "value": "ccccc"}, {"from": "assistant", "value": "ddddd"}]} 3 | {"conversations": [{"from": "user", "value": "AAAAA"}, {"from": "assistant", "value": "BBBBB"}, {"from": "user", "value": "CCCCC"}, {"from": "assistant", "value": "DDDDD"}]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/multi_modal_1.jsonl: -------------------------------------------------------------------------------- 1 | 
{"query": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg55555", "response": "66666"} 2 | {"query": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeghttps://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpegeeeee", "response": "fffff", "history": [["hello", "123"]]} 3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/multi_modal_2.jsonl: -------------------------------------------------------------------------------- 1 | {"query": "55555", "response": "66666", "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 2 | {"query": "eeeee", "response": "fffff", "history": [], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/multi_modal_3.jsonl: -------------------------------------------------------------------------------- 1 | {"query": "55555", "response": "66666", "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 2 | {"query": "eeeee", "response": "fffff", "history": [], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]} 4 | -------------------------------------------------------------------------------- 
/tests/llm/data/sharegpt.jsonl: -------------------------------------------------------------------------------- 1 | {"system": "00000", "conversation": [{"human": "11111", "assistant": "22222"}]} 2 | {"conversation": [{"human": "aaaaa", "assistant": "bbbbb"}]} 3 | {"conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/swift_multi.json: -------------------------------------------------------------------------------- 1 | [{"system": "00000", "query": "55555", "response": "66666"}, 2 | {"query": "eeeee", "response": "fffff", "history": []}, 3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]}] 4 | -------------------------------------------------------------------------------- /tests/llm/data/swift_multi.jsonl: -------------------------------------------------------------------------------- 1 | {"system": "00000", "query": "55555", "response": "66666"} 2 | {"query": "eeeee", "response": "fffff", "history": []} 3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]} 4 | -------------------------------------------------------------------------------- /tests/llm/data/swift_pre.csv: -------------------------------------------------------------------------------- 1 | response 2 | 11111 3 | aaaaa 4 | AAAAA 5 | -------------------------------------------------------------------------------- /tests/llm/data/swift_pre.jsonl: -------------------------------------------------------------------------------- 1 | {"response": "11111"} 2 | {"response": "aaaaa"} 3 | {"response": "AAAAA"} 4 | -------------------------------------------------------------------------------- /tests/llm/data/swift_single.csv: -------------------------------------------------------------------------------- 1 | system,query,response 2 
class TestDataset(unittest.TestCase):
    """Smoke test that loads several sampled datasets through `load_dataset`."""

    def test_load_v_dataset(self):
        """Load each listed dataset and check that its first returned element has more than 800 items."""
        if __name__ != '__main__':
            # Only runs when this file is executed directly. Under a test runner
            # (e.g. the GitHub CI) report an explicit skip instead of returning
            # silently, which would be counted as a pass.
            self.skipTest('ignore citest error in github')

        for dataset_name in ['m3it#1000', 'mantis-instruct#1000', 'llava-med-zh-instruct#1000']:
            loaded = load_dataset(dataset_name)
            # assertGreater is not stripped by `python -O` and reports both values on failure.
            self.assertGreater(len(loaded[0]), 800)
def test_llama3():
    """Run inference for Meta-Llama-3.1-8B-Instruct over a small validation dataset with batch size 2."""
    from swift.llm import infer_main, InferArguments

    infer_args = InferArguments(
        model='LLM-Research/Meta-Llama-3.1-8B-Instruct',
        val_dataset='AI-ModelScope/alpaca-gpt4-data-en#2',
        max_batch_size=2,
    )
    infer_main(infer_args)
def test_emu3_gen(infer_backend):
    """Run non-streaming inference with BAAI/Emu3-Gen on the requested backend.

    Args:
        infer_backend: Forwarded unchanged as the `infer_backend` argument of
            `InferArguments` (e.g. 'pt').
    """
    from swift.llm import infer_main, InferArguments

    # Collect the options first so the generation settings are easy to scan.
    options = dict(
        model='BAAI/Emu3-Gen',
        infer_backend=infer_backend,
        stream=False,
        use_chat_template=False,
        top_k=2048,
        max_new_tokens=40960,
    )
    infer_main(InferArguments(**options))
def test_sampling():
    """Run the sampling entry point with the 'pt' sampler engine and 5 return sequences."""
    sampling_args = SamplingArguments(
        model='LLM-Research/Meta-Llama-3.1-8B-Instruct',
        dataset='AI-ModelScope/alpaca-gpt4-data-zh#5',
        sampler_engine='pt',
        num_return_sequences=5,
    )
    sampling_main(sampling_args)
def test_vit_lr():
    """Fine-tune Qwen2.5-VL-7B-Instruct with separate learning rates for the ViT, the aligner and the rest.

    Nothing is frozen: `freeze_llm`, `freeze_vit` and `freeze_aligner` are all False.
    """
    # https://github.com/QwenLM/Qwen2.5-VL/tree/main/qwen-vl-finetune
    from swift.llm import sft_main, TrainArguments

    learning_rates = {'learning_rate': 1e-5, 'vit_lr': 2e-5, 'aligner_lr': 1e-4}
    freeze_flags = {'freeze_llm': False, 'freeze_vit': False, 'freeze_aligner': False}
    train_args = TrainArguments(
        model='Qwen/Qwen2.5-VL-7B-Instruct',
        dataset=['AI-ModelScope/LaTeX_OCR#20000'],
        **learning_rates,
        **freeze_flags,
    )
    sft_main(train_args)
Inputs:'], regex_mode=True)) 7 | print(split_str_parts_by('aaabbbccc', ['.+?'], regex_mode=True)) 8 | print(split_str_parts_by('aaa\nbb\nb