├── .dev_scripts
    ├── build_docs.sh
    ├── ci_container_test.sh
    └── dockerci.sh
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── custom.md
    │   └── feature_request.md
    ├── PULL_REQUEST_TEMPLATE.md
    ├── SECURITY.md
    └── workflows
    │   ├── citest.yaml
    │   ├── lint.yaml
    │   └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .pre-commit-config_local.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTING_CN.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── README_CN.md
├── asset
    ├── banner.png
    ├── discord_qr.jpg
    └── wechat.png
├── docs
    ├── Makefile
    ├── README.md
    ├── make.bat
    ├── resources
    │   ├── dpo_data.png
    │   ├── grpo.png
    │   ├── grpo_clevr_count.png
    │   ├── grpo_code.png
    │   ├── grpo_countdown.png
    │   ├── grpo_countdown_1.png
    │   ├── grpo_geoqa.png
    │   ├── grpo_openr1_multimodal.png
    │   ├── kto_data.png
    │   ├── web-ui-en.jpg
    │   └── web-ui.jpg
    ├── source
    │   ├── .readthedocs.yaml
    │   ├── BestPractices
    │   │   ├── Embedding训练.md
    │   │   ├── GRPO代码训练.md
    │   │   ├── GRPO多模态训练.md
    │   │   ├── GRPO完整流程.md
    │   │   ├── NPU支持.md
    │   │   ├── 快速训练VL模型.md
    │   │   └── 更多最佳实践.md
    │   ├── Customization
    │   │   ├── 插件化.md
    │   │   ├── 自定义数据集.md
    │   │   └── 自定义模型.md
    │   ├── GetStarted
    │   │   ├── SWIFT安装.md
    │   │   ├── Web-UI.md
    │   │   └── 快速开始.md
    │   ├── Instruction
    │   │   ├── Agent支持.md
    │   │   ├── GRPO.md
    │   │   ├── Megatron-SWIFT训练.md
    │   │   ├── ReleaseNote3.0.md
    │   │   ├── 人类对齐.md
    │   │   ├── 使用tuners.md
    │   │   ├── 命令行参数.md
    │   │   ├── 导出与推送.md
    │   │   ├── 常见问题整理.md
    │   │   ├── 强化微调.md
    │   │   ├── 推理和部署.md
    │   │   ├── 支持的模型和数据集.md
    │   │   ├── 评测.md
    │   │   ├── 采样.md
    │   │   └── 预训练与微调.md
    │   ├── _templates
    │   │   ├── autosummary
    │   │   │   └── class.rst
    │   │   ├── classtemplate.rst
    │   │   └── sobolengine.rst
    │   ├── conf.py
    │   └── index.rst
    └── source_en
    │   ├── .readthedocs.yaml
    │   ├── BestPractices
    │       ├── Embedding.md
    │       ├── GRPO-Code-Training.md
    │       ├── GRPO-Multi-Modal-Training.md
    │       ├── GRPO.md
    │       ├── More-Best-Practices.md
    │       ├── NPU-support.md
    │       └── Rapidly-Training-VL-model.md
    │   ├── Customization
    │       ├── Custom-dataset.md
    │       ├── Custom-model.md
    │       └── Pluginization.md
    │   ├── GetStarted
    │       ├── Quick-start.md
    │       ├── SWIFT-installation.md
    │       └── Web-UI.md
    │   ├── Instruction
    │       ├── Agent-support.md
    │       ├── Command-line-parameters.md
    │       ├── Evaluation.md
    │       ├── Export-and-push.md
    │       ├── Frequently-asked-questions.md
    │       ├── GRPO.md
    │       ├── Inference-and-deployment.md
    │       ├── Megatron-SWIFT-Training.md
    │       ├── Pre-training-and-Fine-tuning.md
    │       ├── RLHF.md
    │       ├── Reinforced-Fine-tuning.md
    │       ├── ReleaseNote3.0.md
    │       ├── Sample.md
    │       ├── Supported-models-and-datasets.md
    │       └── Use-tuners.md
    │   ├── _templates
    │       ├── autosummary
    │       │   └── class.rst
    │       ├── classtemplate.rst
    │       └── sobolengine.rst
    │   ├── conf.py
    │   └── index.rst
├── examples
    ├── README.md
    ├── app
    │   ├── base_url
    │   │   ├── demo.py
    │   │   └── demo.sh
    │   ├── llm.sh
    │   └── mllm.sh
    ├── custom
    │   ├── dataset.py
    │   ├── infer.sh
    │   ├── model.py
    │   └── sft.sh
    ├── deploy
    │   ├── agent
    │   │   ├── client.py
    │   │   └── server.sh
    │   ├── bert
    │   │   ├── client.py
    │   │   └── server.sh
    │   ├── client
    │   │   ├── llm
    │   │   │   ├── base
    │   │   │   │   ├── openai_client.py
    │   │   │   │   └── swift_client.py
    │   │   │   └── chat
    │   │   │   │   ├── openai_client.py
    │   │   │   │   └── swift_client.py
    │   │   └── mllm
    │   │   │   ├── openai_client.py
    │   │   │   └── swift_client.py
    │   ├── lora
    │   │   ├── client.py
    │   │   └── server.sh
    │   ├── reward_model
    │   │   ├── client.py
    │   │   └── server.sh
    │   └── server
    │   │   ├── README.md
    │   │   └── demo.sh
    ├── eval
    │   ├── eval_url
    │   │   ├── demo.py
    │   │   └── eval.sh
    │   ├── llm
    │   │   └── eval.sh
    │   ├── train_eval
    │   │   └── train.sh
    │   └── vlm
    │   │   └── eval.sh
    ├── export
    │   ├── merge_lora.sh
    │   ├── ollama.sh
    │   ├── push_to_hub.sh
    │   └── quantize
    │   │   ├── awq.sh
    │   │   ├── bert
    │   │       ├── bnb.sh
    │   │       └── gptq.sh
    │   │   ├── bnb.sh
    │   │   ├── gptq.sh
    │   │   ├── mllm
    │   │       ├── awq.sh
    │   │       └── gptq.sh
    │   │   ├── moe
    │   │       ├── awq.sh
    │   │       └── gptq.sh
    │   │   ├── omni
    │   │       └── gptq.sh
    │   │   └── reward_model
    │   │       ├── bnb.sh
    │   │       └── gptq.sh
    ├── infer
    │   ├── cli_demo.sh
    │   ├── demo.py
    │   ├── demo_agent.py
    │   ├── demo_bert.py
    │   ├── demo_grounding.py
    │   ├── demo_hf.py
    │   ├── demo_lora.py
    │   ├── demo_mllm.py
    │   ├── demo_reward_model.py
    │   ├── lmdeploy
    │   │   └── mllm_tp.sh
    │   ├── pt
    │   │   ├── batch_ddp.sh
    │   │   ├── bert.sh
    │   │   ├── lora.sh
    │   │   ├── mllm_device_map.sh
    │   │   ├── prm.sh
    │   │   └── reward_model.sh
    │   └── vllm
    │   │   ├── dp_tp.sh
    │   │   ├── mllm_ddp.sh
    │   │   └── mllm_tp.sh
    ├── notebook
    │   ├── qwen2_5-self-cognition
    │   │   ├── infer.ipynb
    │   │   ├── infer.sh
    │   │   ├── self-cognition-sft.ipynb
    │   │   └── sft.sh
    │   ├── qwen2_5-vl-grounding
    │   │   └── zh.ipynb
    │   └── qwen2vl-ocr
    │   │   ├── infer.ipynb
    │   │   └── ocr-sft.ipynb
    ├── sampler
    │   ├── distill
    │   │   └── distill.sh
    │   └── mcts
    │   │   ├── mcts.py
    │   │   ├── mcts.sh
    │   │   └── system_prompt.txt
    └── train
    │   ├── agent
    │       ├── deepseek_r1.sh
    │       ├── glm4.sh
    │       ├── loss_scale
    │       │   ├── infer_lora.py
    │       │   └── train.sh
    │       └── qwen2_5.sh
    │   ├── all_to_all
    │       ├── infer.sh
    │       └── train.sh
    │   ├── base_to_chat
    │       ├── full.sh
    │       ├── lora.sh
    │       └── lora2.sh
    │   ├── embedding
    │       ├── train_gme.sh
    │       └── train_gte.sh
    │   ├── full
    │       ├── infer.sh
    │       ├── qwen2_5_32b.sh
    │       └── train.sh
    │   ├── grpo
    │       ├── external
    │       │   ├── README.md
    │       │   ├── agent.sh
    │       │   ├── grpo_32b_full.sh
    │       │   └── grpo_7b.sh
    │       ├── internal
    │       │   ├── README.md
    │       │   ├── full_lmdeploy.sh
    │       │   ├── pt.sh
    │       │   ├── vllm_72b_4gpu.sh
    │       │   ├── vllm_lora_qwenvl72b.sh
    │       │   ├── vllm_multi_round.sh
    │       │   └── vllm_vl7b.sh
    │       ├── multi_node
    │       │   ├── Qwen2_5_32B_full.sh
    │       │   ├── multi_node1.sh
    │       │   ├── multi_node2.sh
    │       │   └── train_dlc.sh
    │       ├── plugin
    │       │   ├── plugin.py
    │       │   ├── run_external_reward_func.sh
    │       │   └── run_external_reward_model.sh
    │       ├── prompt.txt
    │       └── qwen2_5_omni
    │       │   ├── grpo.sh
    │       │   └── infer.sh
    │   ├── infer.sh
    │   ├── lazy_tokenize
    │       └── train.sh
    │   ├── liger
    │       └── sft.sh
    │   ├── long_text
    │       ├── liger_kernel.sh
    │       ├── sequence_parallel.sh
    │       ├── sequence_parallel_512k.sh
    │       ├── sequence_parallel_dpo.sh
    │       └── sequence_parallel_grpo.sh
    │   ├── lora_sft.sh
    │   ├── megatron
    │       ├── base_to_chat.sh
    │       ├── benchmark
    │       │   └── deepspeed.sh
    │       ├── long_text.sh
    │       ├── moe.sh
    │       ├── multi-node
    │       │   ├── node1.sh
    │       │   └── node2.sh
    │       ├── pretrain.sh
    │       ├── qwen3_32b.sh
    │       ├── qwen3_moe.sh
    │       └── sft.sh
    │   ├── moe
    │       ├── llama4.sh
    │       └── qwen2_5_moe.sh
    │   ├── multi-gpu
    │       ├── ddp
    │       │   └── train.sh
    │       ├── ddp_device_map
    │       │   └── train.sh
    │       ├── deepspeed
    │       │   ├── train_zero2.sh
    │       │   └── train_zero3.sh
    │       ├── device_map
    │       │   └── train.sh
    │       └── fsdp_qlora
    │       │   ├── fsdp_offload.json
    │       │   └── train.sh
    │   ├── multi-node
    │       ├── accelerate
    │       │   ├── multi_node.yaml
    │       │   ├── train_node1.sh
    │       │   └── train_node2.sh
    │       ├── deepspeed
    │       │   ├── README.md
    │       │   ├── host.txt
    │       │   └── train.sh
    │       ├── dlc
    │       │   └── train.sh
    │       ├── swift
    │       │   ├── train_node1.sh
    │       │   └── train_node2.sh
    │       └── torchrun
    │       │   ├── train_node1.sh
    │       │   └── train_node2.sh
    │   ├── multimodal
    │       ├── audio.sh
    │       ├── caption.sh
    │       ├── grounding.sh
    │       ├── infer.sh
    │       ├── lora_llm_full_vit
    │       │   ├── custom_plugin.py
    │       │   ├── infer.sh
    │       │   ├── merge_lora.sh
    │       │   └── sft.sh
    │       ├── ocr.sh
    │       ├── omni
    │       │   ├── infer.sh
    │       │   └── sft.sh
    │       ├── rlhf
    │       │   ├── dpo
    │       │   │   ├── full.sh
    │       │   │   └── lora.sh
    │       │   └── kto.sh
    │       ├── video.sh
    │       └── vit_gradient_checkpointing.sh
    │   ├── optimizer
    │       └── muon.sh
    │   ├── packing
    │       ├── llm.sh
    │       ├── qwen2_5_omni.sh
    │       ├── qwen2_5_vl.sh
    │       └── streaming.sh
    │   ├── padding_free
    │       └── sft.sh
    │   ├── plugins
    │       ├── loss_scale.sh
    │       └── tuner_phi4_mm.sh
    │   ├── predict_with_generate
    │       └── train.sh
    │   ├── pretrain
    │       └── train.sh
    │   ├── qlora
    │       ├── awq.sh
    │       ├── bnb.sh
    │       ├── gptq.sh
    │       └── hqq.sh
    │   ├── rft
    │       ├── math.json
    │       └── rft.py
    │   ├── rlhf
    │       ├── README.md
    │       ├── cpo.sh
    │       ├── dpo
    │       │   ├── full.sh
    │       │   └── lora.sh
    │       ├── kto.sh
    │       ├── orpo.sh
    │       ├── ppo
    │       │   ├── full.sh
    │       │   └── lora.sh
    │       ├── rm.sh
    │       └── simpo.sh
    │   ├── seq_cls
    │       ├── bert
    │       │   ├── deploy.sh
    │       │   ├── infer.sh
    │       │   └── sft.sh
    │       ├── multi_label
    │       │   └── sft.sh
    │       ├── qwen2_5
    │       │   ├── deploy.sh
    │       │   ├── infer.sh
    │       │   └── sft.sh
    │       ├── qwen2_vl
    │       │   ├── infer.sh
    │       │   └── sft.sh
    │       └── regression
    │       │   ├── deploy.sh
    │       │   ├── infer.sh
    │       │   └── sft.sh
    │   ├── streaming
    │       └── train.sh
    │   ├── think_model
    │       ├── deepseek_r1.sh
    │       ├── qwen3_demo1.sh
    │       └── qwen3_demo2.sh
    │   └── tuners
    │       ├── adalora
    │           └── train.sh
    │       ├── adapter
    │           └── train.sh
    │       ├── boft
    │           └── train.sh
    │       ├── bone
    │           └── train.sh
    │       ├── dora
    │           └── train.sh
    │       ├── galore
    │           ├── train_galore.sh
    │           └── train_qgalore.sh
    │       ├── lisa
    │           └── train.sh
    │       ├── llamapro
    │           └── train.sh
    │       ├── longlora
    │           └── train.sh
    │       ├── lora-ga
    │           └── train.sh
    │       ├── lora
    │           └── train.sh
    │       ├── neftune
    │           └── train.sh
    │       ├── olora
    │           └── train.sh
    │       ├── pissa
    │           └── train.sh
    │       ├── qlora
    │           └── train.sh
    │       ├── reft
    │           └── train.sh
    │       └── unsloth
    │           └── train.sh
├── requirements.txt
├── requirements
    ├── docs.txt
    ├── eval.txt
    ├── framework.txt
    ├── install_all.sh
    ├── seq_parallel.txt
    ├── swanlab.txt
    └── tests.txt
├── scripts
    ├── benchmark
    │   ├── config
    │   │   └── tuner.json
    │   ├── exp.py
    │   ├── exp_utils.py
    │   └── generate_report.py
    └── utils
    │   ├── plot_loss.py
    │   ├── run_dataset_info.py
    │   ├── run_model_info.py
    │   ├── run_template.py
    │   └── test_link_valid.py
├── setup.cfg
├── setup.py
├── swift
    ├── __init__.py
    ├── cli
    │   ├── __init__.py
    │   ├── _megatron
    │   │   ├── __init__.py
    │   │   ├── main.py
    │   │   ├── pt.py
    │   │   └── sft.py
    │   ├── app.py
    │   ├── deploy.py
    │   ├── eval.py
    │   ├── export.py
    │   ├── infer.py
    │   ├── main.py
    │   ├── merge_lora.py
    │   ├── pt.py
    │   ├── rlhf.py
    │   ├── rollout.py
    │   ├── sample.py
    │   ├── sft.py
    │   └── web_ui.py
    ├── hub
    │   ├── __init__.py
    │   ├── constant.py
    │   └── hub.py
    ├── llm
    │   ├── __init__.py
    │   ├── app
    │   │   ├── __init__.py
    │   │   ├── app.py
    │   │   ├── build_ui.py
    │   │   └── locale.py
    │   ├── argument
    │   │   ├── __init__.py
    │   │   ├── app_args.py
    │   │   ├── base_args
    │   │   │   ├── __init__.py
    │   │   │   ├── base_args.py
    │   │   │   ├── data_args.py
    │   │   │   ├── generation_args.py
    │   │   │   ├── model_args.py
    │   │   │   ├── quant_args.py
    │   │   │   ├── template_args.py
    │   │   │   └── utils.py
    │   │   ├── deploy_args.py
    │   │   ├── eval_args.py
    │   │   ├── export_args.py
    │   │   ├── infer_args.py
    │   │   ├── merge_args.py
    │   │   ├── rlhf_args.py
    │   │   ├── sampling_args.py
    │   │   ├── train_args.py
    │   │   ├── tuner_args.py
    │   │   └── webui_args.py
    │   ├── base.py
    │   ├── data_loader.py
    │   ├── dataset
    │   │   ├── __init__.py
    │   │   ├── data
    │   │   │   └── dataset_info.json
    │   │   ├── dataset
    │   │   │   ├── __init__.py
    │   │   │   ├── llm.py
    │   │   │   └── mllm.py
    │   │   ├── loader.py
    │   │   ├── media.py
    │   │   ├── preprocessor
    │   │   │   ├── __init__.py
    │   │   │   ├── core.py
    │   │   │   └── extra.py
    │   │   ├── register.py
    │   │   └── utils.py
    │   ├── ds_config
    │   │   ├── zero0.json
    │   │   ├── zero1.json
    │   │   ├── zero2.json
    │   │   ├── zero2_offload.json
    │   │   ├── zero3.json
    │   │   └── zero3_offload.json
    │   ├── eval
    │   │   ├── __init__.py
    │   │   ├── eval.py
    │   │   └── utils.py
    │   ├── export
    │   │   ├── __init__.py
    │   │   ├── export.py
    │   │   ├── merge_lora.py
    │   │   ├── ollama.py
    │   │   └── quant.py
    │   ├── infer
    │   │   ├── __init__.py
    │   │   ├── deploy.py
    │   │   ├── infer.py
    │   │   ├── infer_engine
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   ├── grpo_vllm_engine.py
    │   │   │   ├── infer_client.py
    │   │   │   ├── infer_engine.py
    │   │   │   ├── lmdeploy_engine.py
    │   │   │   ├── patch.py
    │   │   │   ├── pt_engine.py
    │   │   │   ├── utils.py
    │   │   │   └── vllm_engine.py
    │   │   ├── protocol.py
    │   │   ├── rollout.py
    │   │   └── utils.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── constant.py
    │   │   ├── model
    │   │   │   ├── __init__.py
    │   │   │   ├── baai.py
    │   │   │   ├── baichuan.py
    │   │   │   ├── bert.py
    │   │   │   ├── codefuse.py
    │   │   │   ├── deepseek.py
    │   │   │   ├── gemma.py
    │   │   │   ├── glm.py
    │   │   │   ├── internlm.py
    │   │   │   ├── llama.py
    │   │   │   ├── llava.py
    │   │   │   ├── llm.py
    │   │   │   ├── mamba.py
    │   │   │   ├── microsoft.py
    │   │   │   ├── minicpm.py
    │   │   │   ├── minimax.py
    │   │   │   ├── mistral.py
    │   │   │   ├── mllm.py
    │   │   │   ├── moonshot.py
    │   │   │   ├── mplug.py
    │   │   │   ├── openbuddy.py
    │   │   │   ├── qwen.py
    │   │   │   ├── skywork.py
    │   │   │   ├── stepfun.py
    │   │   │   ├── telechat.py
    │   │   │   ├── valley.py
    │   │   │   └── yi.py
    │   │   ├── model_arch.py
    │   │   ├── patcher.py
    │   │   ├── register.py
    │   │   └── utils.py
    │   ├── sampling
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── distill_sampler.py
    │   │   ├── mcts.py
    │   │   ├── sampling.py
    │   │   ├── utils.py
    │   │   └── vanilla_sampler.py
    │   ├── template
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── constant.py
    │   │   ├── grounding.py
    │   │   ├── register.py
    │   │   ├── template
    │   │   │   ├── __init__.py
    │   │   │   ├── deepseek.py
    │   │   │   ├── emu3.py
    │   │   │   ├── gemma.py
    │   │   │   ├── glm.py
    │   │   │   ├── idefics3.py
    │   │   │   ├── internlm.py
    │   │   │   ├── internvl.py
    │   │   │   ├── llama.py
    │   │   │   ├── llava.py
    │   │   │   ├── llm.py
    │   │   │   ├── megrez.py
    │   │   │   ├── microsoft.py
    │   │   │   ├── minicpm.py
    │   │   │   ├── minimax.py
    │   │   │   ├── mistral.py
    │   │   │   ├── molmo.py
    │   │   │   ├── moonshot.py
    │   │   │   ├── mplug.py
    │   │   │   ├── openbuddy.py
    │   │   │   ├── pixtral.py
    │   │   │   ├── qwen.py
    │   │   │   ├── stepfun.py
    │   │   │   ├── utils.py
    │   │   │   ├── valley.py
    │   │   │   └── yi.py
    │   │   ├── template_inputs.py
    │   │   ├── template_meta.py
    │   │   ├── utils.py
    │   │   └── vision_utils.py
    │   ├── train
    │   │   ├── __init__.py
    │   │   ├── callback.py
    │   │   ├── kto.py
    │   │   ├── pt.py
    │   │   ├── rlhf.py
    │   │   ├── sft.py
    │   │   └── tuner.py
    │   └── utils.py
    ├── megatron
    │   ├── __init__.py
    │   ├── argument
    │   │   ├── __init__.py
    │   │   ├── megatron_args.py
    │   │   └── train_args.py
    │   ├── init.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── config.py
    │   │   ├── constant.py
    │   │   ├── gpt
    │   │   │   ├── __init__.py
    │   │   │   ├── config.py
    │   │   │   ├── hf2mcore.py
    │   │   │   ├── mcore2hf.py
    │   │   │   └── model.py
    │   │   ├── register.py
    │   │   └── rope.py
    │   ├── train
    │   │   ├── __init__.py
    │   │   ├── patcher.py
    │   │   ├── pt.py
    │   │   ├── sft.py
    │   │   └── utils.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── convert.py
    │   │   └── patcher.py
    ├── plugin
    │   ├── __init__.py
    │   ├── agent_template
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── extra.py
    │   │   ├── glm4.py
    │   │   ├── hermes.py
    │   │   ├── llama.py
    │   │   ├── qwen.py
    │   │   ├── react.py
    │   │   └── toolbench.py
    │   ├── callback.py
    │   ├── loss.py
    │   ├── loss_scale
    │   │   ├── __init__.py
    │   │   ├── config
    │   │   │   ├── agentflan.json
    │   │   │   ├── alpha_umi.json
    │   │   │   ├── hermes.json
    │   │   │   ├── ignore_empty_think.json
    │   │   │   ├── qwen.json
    │   │   │   └── react.json
    │   │   ├── loss_scale.py
    │   │   └── utils.py
    │   ├── metric.py
    │   ├── multi_turn.py
    │   ├── optimizer.py
    │   ├── orm.py
    │   ├── prm.py
    │   ├── rm_plugin.py
    │   └── tuner.py
    ├── trainers
    │   ├── __init__.py
    │   ├── arguments.py
    │   ├── callback.py
    │   ├── mixin.py
    │   ├── optimizers
    │   │   ├── __init__.py
    │   │   └── galore
    │   │   │   ├── __init__.py
    │   │   │   ├── adafactor.py
    │   │   │   ├── adamw.py
    │   │   │   ├── adamw8bit.py
    │   │   │   ├── galore_projector.py
    │   │   │   └── utils.py
    │   ├── rlhf_arguments.py
    │   ├── rlhf_trainer
    │   │   ├── __init__.py
    │   │   ├── cpo_trainer.py
    │   │   ├── dpo_trainer.py
    │   │   ├── grpo_trainer.py
    │   │   ├── kto_trainer.py
    │   │   ├── orpo_trainer.py
    │   │   ├── ppo_trainer.py
    │   │   ├── reward_trainer.py
    │   │   ├── rlhf_mixin.py
    │   │   ├── utils.py
    │   │   └── vllm_client.py
    │   ├── sequence_parallel
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── ulysses.py
    │   │   └── xtuner.py
    │   ├── torchacc_mixin.py
    │   ├── trainer_factory.py
    │   ├── trainers.py
    │   └── utils.py
    ├── tuners
    │   ├── __init__.py
    │   ├── adapter.py
    │   ├── base.py
    │   ├── llamapro.py
    │   ├── longlora
    │   │   ├── __init__.py
    │   │   ├── llama.py
    │   │   └── longlora.py
    │   ├── lora.py
    │   ├── lora_layers.py
    │   ├── mapping.py
    │   ├── neftune.py
    │   ├── part.py
    │   ├── peft.py
    │   ├── prompt.py
    │   ├── reft.py
    │   ├── restuning.py
    │   ├── restuning_components.py
    │   ├── scetuning
    │   │   ├── __init__.py
    │   │   ├── scetuning.py
    │   │   └── scetuning_components.py
    │   ├── side.py
    │   └── utils.py
    ├── ui
    │   ├── __init__.py
    │   ├── app.py
    │   ├── base.py
    │   ├── llm_eval
    │   │   ├── __init__.py
    │   │   ├── eval.py
    │   │   ├── llm_eval.py
    │   │   ├── model.py
    │   │   └── runtime.py
    │   ├── llm_export
    │   │   ├── __init__.py
    │   │   ├── export.py
    │   │   ├── llm_export.py
    │   │   ├── model.py
    │   │   └── runtime.py
    │   ├── llm_grpo
    │   │   ├── __init__.py
    │   │   ├── grpo_advanced.py
    │   │   ├── llm_grpo.py
    │   │   ├── model.py
    │   │   ├── ref_model.py
    │   │   ├── reward.py
    │   │   └── rollout.py
    │   ├── llm_infer
    │   │   ├── __init__.py
    │   │   ├── generate.py
    │   │   ├── llm_infer.py
    │   │   ├── model.py
    │   │   └── runtime.py
    │   └── llm_train
    │   │   ├── __init__.py
    │   │   ├── advanced.py
    │   │   ├── dataset.py
    │   │   ├── galore.py
    │   │   ├── hyper.py
    │   │   ├── lisa.py
    │   │   ├── llamapro.py
    │   │   ├── llm_train.py
    │   │   ├── lora.py
    │   │   ├── model.py
    │   │   ├── quantization.py
    │   │   ├── report_to.py
    │   │   ├── rlhf.py
    │   │   ├── runtime.py
    │   │   ├── save.py
    │   │   ├── self_cog.py
    │   │   └── utils.py
    ├── utils
    │   ├── __init__.py
    │   ├── constants.py
    │   ├── env.py
    │   ├── import_utils.py
    │   ├── io_utils.py
    │   ├── logger.py
    │   ├── np_utils.py
    │   ├── tb_utils.py
    │   ├── torch_utils.py
    │   ├── torchacc_utils.py
    │   └── utils.py
    └── version.py
└── tests
    ├── __init__.py
    ├── app
        └── test_app.py
    ├── deploy
        ├── test_dataset.py
        └── test_logprobs.py
    ├── eval
        └── test_eval.py
    ├── export
        └── test_quant.py
    ├── general
        ├── test_arch.py
        ├── test_dataset.py
        ├── test_model.py
        ├── test_stream.py
        └── test_template.py
    ├── hub
        ├── __init__.py
        └── test_check_model.py
    ├── infer
        ├── test_agent.py
        ├── test_infer.py
        ├── test_logprobs.py
        ├── test_main.py
        ├── test_max_memory.py
        └── test_mllm.py
    ├── llm
        ├── __init__.py
        ├── config
        │   ├── infer.json
        │   └── sft.json
        ├── data
        │   ├── alpaca.csv
        │   ├── alpaca.jsonl
        │   ├── alpaca2.csv
        │   ├── chatml.jsonl
        │   ├── conversations.jsonl
        │   ├── multi_modal_1.jsonl
        │   ├── multi_modal_2.jsonl
        │   ├── multi_modal_3.jsonl
        │   ├── sharegpt.jsonl
        │   ├── swift_multi.json
        │   ├── swift_multi.jsonl
        │   ├── swift_pre.csv
        │   ├── swift_pre.jsonl
        │   ├── swift_single.csv
        │   └── swift_single.jsonl
        ├── load_model.py
        ├── load_template.py
        ├── test_custom.py
        ├── test_dataset.py
        ├── test_ollama_export.py
        ├── test_run.py
        ├── test_run3.py
        ├── test_template.py
        └── test_utils.py
    ├── megatron
        ├── test_align
        │   └── test_llm.py
        ├── test_export.py
        ├── test_model.py
        ├── test_save.py
        └── test_train.py
    ├── model_tag.py
    ├── models
        ├── test_flash_attn.py
        ├── test_llm.py
        └── test_mllm.py
    ├── run.py
    ├── run_config.yaml
    ├── sample
        └── test_client.py
    ├── test_align
        ├── test_cls.py
        ├── test_lmdeploy_vlm.py
        ├── test_padding_side.py
        ├── test_rlhf_loss.py
        ├── test_template
        │   ├── test_agent.py
        │   ├── test_audio.py
        │   ├── test_gene.py
        │   ├── test_llm.py
        │   ├── test_template.py
        │   ├── test_tool.py
        │   ├── test_video.py
        │   └── test_vision.py
        └── test_vllm_vlm.py
    ├── test_utils.py
    ├── train
        ├── test_cls.py
        ├── test_freeze.py
        ├── test_grounding.py
        ├── test_grpo.py
        ├── test_kto.py
        ├── test_liger.py
        ├── test_multilabel.py
        ├── test_packing.py
        ├── test_ppo.py
        ├── test_pt.py
        ├── test_rlhf.py
        ├── test_sample.py
        ├── test_sft.py
        ├── test_train_eval.py
        └── test_vit_lr.py
    ├── tuners
        ├── __init__.py
        ├── test_extra_state_dict.py
        ├── test_merged_linear.py
        ├── test_neft.py
        ├── test_peft.py
        ├── test_scetuning.py
        ├── test_swift_base.py
        ├── test_swift_device_map.py
        └── test_swift_restuning.py
    └── utils
        ├── __init__.py
        ├── test_file_utils.py
        ├── test_io_utils.py
        ├── test_split_str_parts_by.py
        └── test_torch_utils.py


/.dev_scripts/build_docs.sh:
--------------------------------------------------------------------------------
1 | pip install -r requirements/docs.txt
2 | # Abort if docs/ is missing so `rm -rf build` cannot run in the wrong directory.
3 | cd docs || exit 1
4 | rm -rf build
5 | # update api rst
6 | #rm -rf source/api/
7 | #sphinx-apidoc --module-first -o source/api/ ../modelscope/
8 | make html
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | Describe the bug and how to reproduce it, preferably with screenshots(描述bug以及复现过程，最好有截图)
12 | 
13 | 
14 | **Your hardware and system info**
15 | Write your system info like CUDA version/system/GPU/torch version here(在这里给出硬件信息和系统信息，如CUDA版本，系统，GPU型号和torch版本等)
16 | 
17 | 
18 | **Additional context**
19 | Add any other context about the problem here(在这里补充其他信息)
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the feature**
11 | Please describe the feature requested here(请在这里描述需求)
12 | 
13 | **Paste any useful information**
14 | Paste any useful information, including papers, github links, etc.(请在这里描述其他有用的信息，比如相关的论文地址，github链接等)
15 | 
16 | **Additional context**
17 | Add any other context or information here(其他信息可以写在这里)
18 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | # PR type
 2 | - [ ] Bug Fix
 3 | - [ ] New Feature
 4 | - [ ] Document Updates
 5 | - [ ] More Models or Datasets Support
 6 | 
 7 | # PR information
 8 | 
 9 | Write the detailed information that belongs to this PR.
10 | 
11 | ## Experiment results
12 | 
13 | Paste your experiment results here (if needed).
14 | 


--------------------------------------------------------------------------------
/.github/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Reporting Security Issues
2 | 
3 | Security issues in a deep learning project usually come from non-standard third-party packages or continuously running services. If you encounter a security issue in our project, please report it to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
4 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
 1 | name: Lint test
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | # Cancel a still-running lint job when a newer run starts for the same workflow and ref.
 6 | concurrency:
 7 |   group: ${{ github.workflow }}-${{ github.ref }}
 8 |   cancel-in-progress: true
 9 | 
10 | jobs:
11 |   lint:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       # The v2 majors of these actions target a retired Node.js runtime; use maintained majors.
15 |       - uses: actions/checkout@v4
16 |       - name: Set up Python 3.10
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.10'
20 |       - name: Install pre-commit hook
21 |         run: |
22 |           pip install pre-commit
23 |       - name: Linting
24 |         run: pre-commit run --all-files
25 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yaml:
--------------------------------------------------------------------------------
 1 | name: release
 2 | 
 3 | # Triggered only by pushed tags matching 'v**'.
 4 | on:
 5 |   push:
 6 |     tags:
 7 |       - 'v**'
 8 | 
 9 | concurrency:
10 |   group: ${{ github.workflow }}-${{ github.ref }}-publish
11 |   cancel-in-progress: true
12 | 
13 | jobs:
14 |   build-n-publish:
15 |     runs-on: ubuntu-22.04
16 |     #if: startsWith(github.event.ref, 'refs/tags')
17 |     steps:
18 |       # The v2 majors of these actions target a retired Node.js runtime; use maintained majors.
19 |       - uses: actions/checkout@v4
20 |       - name: Set up Python 3.10
21 |         uses: actions/setup-python@v5
22 |         with:
23 |           python-version: '3.10'
24 |       - name: Install wheel
25 |         run: pip install wheel packaging setuptools==69.5.1
26 |       - name: Build ModelScope Swift
27 |         run: python setup.py sdist bdist_wheel
28 |       - name: Publish package to PyPI
29 |         # Pass credentials through the environment so the token never appears
30 |         # in the process argument list; twine reads TWINE_USERNAME/TWINE_PASSWORD.
31 |         env:
32 |           TWINE_USERNAME: __token__
33 |           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
34 |         run: |
35 |           pip install twine
36 |           twine upload dist/* --skip-existing
37 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include swift/utils *.py
2 | recursive-include swift/llm/dataset/data *.*
3 | recursive-include swift/llm/ds_config *.json
4 | recursive-include requirements *.txt
5 | recursive-include swift/plugin/loss_scale/config *.json
6 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Developer entry points: build the wheel and docs, lint, test, clean.
 2 | WHL_BUILD_DIR :=package
 3 | DOC_BUILD_DIR :=docs/build/
 4 | 
 5 | # default rule
 6 | default: whl docs
 7 | 
 8 | .PHONY: docs
 9 | docs:
10 | 	bash .dev_scripts/build_docs.sh
11 | 
12 | # The former .dev_scripts/linter.sh is not present in the repository, so run
13 | # the pre-commit hooks directly, the same command the lint workflow uses.
14 | .PHONY: linter
15 | linter:
16 | 	pre-commit run --all-files
17 | 
18 | # The former .dev_scripts/citest.sh is not present in the repository.
19 | # NOTE(review): tests/run.py is assumed to be the test entry point -- confirm.
20 | .PHONY: test
21 | test:
22 | 	python tests/run.py
23 | 
24 | .PHONY: whl
25 | whl:
26 | 	python setup.py sdist bdist_wheel
27 | 
28 | # Remove the wheel and documentation build directories named above.
29 | .PHONY: clean
30 | clean:
31 | 	rm -rf  $(WHL_BUILD_DIR) $(DOC_BUILD_DIR)
32 | 


--------------------------------------------------------------------------------
/asset/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/banner.png


--------------------------------------------------------------------------------
/asset/discord_qr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/discord_qr.jpg


--------------------------------------------------------------------------------
/asset/wechat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/asset/wechat.png


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/resources/dpo_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/dpo_data.png


--------------------------------------------------------------------------------
/docs/resources/grpo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo.png


--------------------------------------------------------------------------------
/docs/resources/grpo_clevr_count.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_clevr_count.png


--------------------------------------------------------------------------------
/docs/resources/grpo_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_code.png


--------------------------------------------------------------------------------
/docs/resources/grpo_countdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_countdown.png


--------------------------------------------------------------------------------
/docs/resources/grpo_countdown_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_countdown_1.png


--------------------------------------------------------------------------------
/docs/resources/grpo_geoqa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_geoqa.png


--------------------------------------------------------------------------------
/docs/resources/grpo_openr1_multimodal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/grpo_openr1_multimodal.png


--------------------------------------------------------------------------------
/docs/resources/kto_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/kto_data.png


--------------------------------------------------------------------------------
/docs/resources/web-ui-en.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/web-ui-en.jpg


--------------------------------------------------------------------------------
/docs/resources/web-ui.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/docs/resources/web-ui.jpg


--------------------------------------------------------------------------------
/docs/source/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the OS, Python version and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.10"
13 | 
14 | # Build documentation in the "docs/" directory with Sphinx
15 | sphinx:
16 |   configuration: docs/source/conf.py
17 | 
18 | # Optionally build your docs in additional formats such as PDF and ePub
19 | # formats:
20 | #    - pdf
21 | #    - epub
22 | 
23 | # Optional but recommended, declare the Python requirements required
24 | # to build your documentation
25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
26 | python:
27 |    install:
28 |       - requirements: requirements/docs.txt
29 |       - requirements: requirements/framework.txt
30 | 


--------------------------------------------------------------------------------
/docs/source/BestPractices/更多最佳实践.md:
--------------------------------------------------------------------------------
1 | 
2 | # 更多最佳实践
3 | 
4 | - [Qwen2.5自我认知微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition)
5 | - [Qwen2-VL Latex-OCR微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr)
6 | - [Qwen2.5-VL Grounding任务微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding)
7 | - [Qwen3全流程最佳实践](https://github.com/modelscope/ms-swift/issues/4030)
8 | 


--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :inherited-members:
 8 |     :members:
 9 | 
10 | .. autogenerated from source/_templates/autosummary/class.rst
11 | 


--------------------------------------------------------------------------------
/docs/source/_templates/classtemplate.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :members:
 8 |     :special-members: __init__, __call__
 9 | 
10 | ..
11 |   autogenerated from source/_templates/classtemplate.rst
12 |   note it does not have :inherited-members:
13 | 


--------------------------------------------------------------------------------
/docs/source/_templates/sobolengine.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :members:
 8 |     :exclude-members: MAXBIT, MAXDIM
 9 |     :undoc-members:
10 | 
11 | 
12 | ..
13 |   autogenerated from source/_templates/sobolengine.rst
14 |   note it has specific options
15 | 


--------------------------------------------------------------------------------
/docs/source_en/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the OS, Python version and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.12"
13 | 
14 | # Build documentation in the "docs/" directory with Sphinx
15 | sphinx:
16 |   configuration: docs/source_en/conf.py
17 | 
18 | # Optionally build your docs in additional formats such as PDF and ePub
19 | # formats:
20 | #    - pdf
21 | #    - epub
22 | 
23 | # Optional but recommended, declare the Python requirements required
24 | # to build your documentation
25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
26 | python:
27 |    install:
28 |       - requirements: requirements/docs.txt
29 |       - requirements: requirements/framework.txt
30 | 


--------------------------------------------------------------------------------
/docs/source_en/BestPractices/More-Best-Practices.md:
--------------------------------------------------------------------------------
1 | 
2 | # More Best Practices
3 | 
4 | - [Qwen2.5 self-cognition SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition)
5 | - [Qwen2-VL Latex-OCR SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr)
6 | - [Qwen2.5-VL Grounding Task SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding)
7 | - [Qwen3 Full-Process Best Practices](https://github.com/modelscope/ms-swift/issues/4030)
8 | 


--------------------------------------------------------------------------------
/docs/source_en/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :inherited-members:
 8 |     :members:
 9 | 
10 | .. autogenerated from source/_templates/autosummary/class.rst
11 | 


--------------------------------------------------------------------------------
/docs/source_en/_templates/classtemplate.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :members:
 8 |     :special-members: __init__, __call__
 9 | 
10 | ..
11 |   autogenerated from source/_templates/classtemplate.rst
12 |   note it does not have :inherited-members:
13 | 


--------------------------------------------------------------------------------
/docs/source_en/_templates/sobolengine.rst:
--------------------------------------------------------------------------------
 1 | .. currentmodule:: {{ module }}
 2 | 
 3 | 
 4 | {{ name | underline}}
 5 | 
 6 | .. autoclass:: {{ name }}
 7 |     :members:
 8 |     :exclude-members: MAXBIT, MAXDIM
 9 |     :undoc-members:
10 | 
11 | 
12 | ..
13 |   autogenerated from source/_templates/sobolengine.rst
14 |   note it has specific options
15 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Instructions
 2 | 
 3 | These examples show how to use SWIFT for training, inference, deployment, evaluation, and quantization. By default, models are downloaded from the ModelScope community.
 4 | 
 5 | If you want to download from the Hugging Face community instead, add `--use_hf 1` to the command line like this:
 6 | 
 7 | ```shell
 8 | ...
 9 | swift sft \
10 |     --model <model_id_or_path> \
11 |     --use_hf 1 \
12 |     ...
13 | ```
14 | 


--------------------------------------------------------------------------------
/examples/app/base_url/demo.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | import os
 3 | 
 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 5 | 
 6 | if __name__ == '__main__':
 7 |     from swift.llm import AppArguments, app_main, DeployArguments, run_deploy
 8 |     # A runnable demo is provided here.
 9 |     # In a real scenario, you can simply remove the `run_deploy` context and point `base_url` at your own service.
10 |     with run_deploy(
11 |             DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'),
12 |             return_url=True) as url:
13 |         app_main(AppArguments(model='Qwen2.5-1.5B-Instruct', base_url=url, stream=True, max_new_tokens=2048))
14 | 


--------------------------------------------------------------------------------
/examples/app/base_url/demo.sh:
--------------------------------------------------------------------------------
1 | # You need to have a deployed model or api service first
2 | CUDA_VISIBLE_DEVICES=0 swift app \
3 |     --model '<model_name>' \
4 |     --base_url http://127.0.0.1:8000/v1 \
5 |     --stream true \
6 |     --max_new_tokens 2048 \
7 |     --lang zh
8 | 


--------------------------------------------------------------------------------
/examples/app/llm.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 swift app \
2 |     --model Qwen/Qwen2.5-7B-Instruct \
3 |     --stream true \
4 |     --infer_backend vllm \
5 |     --max_new_tokens 2048 \
6 |     --gpu_memory_utilization 0.9 \
7 |     --max_model_len 8192 \
8 |     --lang zh
9 | 


--------------------------------------------------------------------------------
/examples/app/mllm.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | MAX_PIXELS=1003520 \
 3 | VIDEO_MAX_PIXELS=50176 \
 4 | FPS_MAX_FRAMES=12 \
 5 | swift app \
 6 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 7 |     --stream true \
 8 |     --infer_backend vllm \
 9 |     --gpu_memory_utilization 0.9 \
10 |     --max_model_len 8192 \
11 |     --max_new_tokens 2048 \
12 |     --limit_mm_per_prompt '{"image": 5, "video": 2}' \
13 |     --lang zh
14 | 


--------------------------------------------------------------------------------
/examples/custom/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from typing import Any, Dict, Optional
 3 | 
 4 | from swift.llm import DatasetMeta, ResponsePreprocessor, load_dataset, register_dataset
 5 | 
 6 | 
 7 | class CustomPreprocessor(ResponsePreprocessor):
 8 |     prompt = """Task: Based on the given two sentences, provide a similarity score between 0.0 and 5.0.
 9 | Sentence 1: {text1}
10 | Sentence 2: {text2}
11 | Similarity score: """
12 | 
13 |     def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
14 |         return super().preprocess({
15 |             'query': self.prompt.format(text1=row['text1'], text2=row['text2']),
16 |             'response': f"{row['label']:.1f}"
17 |         })
18 | 
19 | 
20 | register_dataset(
21 |     DatasetMeta(
22 |         ms_dataset_id='swift/stsb',
23 |         hf_dataset_id='SetFit/stsb',
24 |         preprocess_func=CustomPreprocessor(),
25 |     ))
26 | 
27 | if __name__ == '__main__':
28 |     dataset = load_dataset(['swift/stsb'])[0]
29 |     print(f'dataset: {dataset}')
30 |     print(f'dataset[0]: {dataset[0]}')
31 | 


--------------------------------------------------------------------------------
/examples/custom/infer.sh:
--------------------------------------------------------------------------------
 1 | # sh examples/custom/infer.sh
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift infer \
 4 |     --adapters output/vx-xxx/checkpoint-xxx \
 5 |     --load_data_args true \
 6 |     --infer_backend pt \
 7 |     --max_batch_size 16 \
 8 |     --max_new_tokens 256 \
 9 |     --temperature 0
10 | 


--------------------------------------------------------------------------------
/examples/custom/sft.sh:
--------------------------------------------------------------------------------
 1 | # sh examples/custom/sft.sh
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --custom_register_path examples/custom/dataset.py \
 5 |                            examples/custom/model.py \
 6 |     --model AI-ModelScope/Nemotron-Mini-4B-Instruct \
 7 |     --train_type lora \
 8 |     --dataset swift/stsb \
 9 |     --num_train_epochs 3 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --gradient_accumulation_steps 16 \
17 |     --eval_steps 100 \
18 |     --save_steps 100 \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --warmup_ratio 0.05 \
22 |     --dataloader_num_workers 4 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --dataset_num_proc 4
26 | 


--------------------------------------------------------------------------------
/examples/deploy/agent/server.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 swift deploy \
2 |     --model Qwen/Qwen2.5-7B-Instruct \
3 |     --infer_backend vllm \
4 |     --gpu_memory_utilization 0.9 \
5 |     --max_model_len 8192 \
6 |     --max_new_tokens 2048 \
7 |     --agent_template hermes \
8 |     --served_model_name Qwen2.5-7B-Instruct
9 | 


--------------------------------------------------------------------------------
/examples/deploy/bert/client.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from swift.llm import InferClient, InferRequest
 4 | 
 5 | 
 6 | def infer_batch(engine: InferClient, infer_requests: List[InferRequest]):
 7 |     resp_list = engine.infer(infer_requests)
 8 |     query0 = infer_requests[0].messages[0]['content']
 9 |     query1 = infer_requests[1].messages[0]['content']
10 |     print(f'query0: {query0}')
11 |     print(f'response0: {resp_list[0].choices[0].message.content}')
12 |     print(f'query1: {query1}')
13 |     print(f'response1: {resp_list[1].choices[0].message.content}')
14 | 
15 | 
16 | if __name__ == '__main__':
17 |     engine = InferClient(host='127.0.0.1', port=8000)
18 |     models = engine.models
19 |     print(f'models: {models}')
20 |     infer_batch(engine, [
21 |         InferRequest(messages=[{
22 |             'role': 'user',
23 |             'content': '今天天气真好呀'
24 |         }]),
25 |         InferRequest(messages=[{
26 |             'role': 'user',
27 |             'content': '真倒霉'
28 |         }])
29 |     ])
30 | 


--------------------------------------------------------------------------------
/examples/deploy/bert/server.sh:
--------------------------------------------------------------------------------
 1 | # Since `swift/test_bert` is trained by swift and contains an `args.json` file,
 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
 3 | CUDA_VISIBLE_DEVICES=0 swift deploy \
 4 |     --host 0.0.0.0 \
 5 |     --port 8000 \
 6 |     --adapters swift/test_bert \
 7 |     --served_model_name bert-base-chinese \
 8 |     --infer_backend pt \
 9 |     --truncation_strategy right \
10 |     --max_length 512
11 | 


--------------------------------------------------------------------------------
/examples/deploy/lora/client.py:
--------------------------------------------------------------------------------
 1 | from swift.llm import InferClient, InferRequest, RequestConfig
 2 | 
 3 | 
 4 | def infer_multilora(engine: InferClient, infer_request: InferRequest):
 5 |     # Dynamic LoRA
 6 |     models = engine.models
 7 |     print(f'models: {models}')
 8 |     request_config = RequestConfig(max_tokens=512, temperature=0)
 9 | 
10 |     # use lora1
11 |     resp_list = engine.infer([infer_request], request_config, model=models[1])
12 |     response = resp_list[0].choices[0].message.content
13 |     print(f'lora1-response: {response}')
14 |     # origin model
15 |     resp_list = engine.infer([infer_request], request_config, model=models[0])
16 |     response = resp_list[0].choices[0].message.content
17 |     print(f'response: {response}')
18 |     # use lora2
19 |     resp_list = engine.infer([infer_request], request_config, model=models[2])
20 |     response = resp_list[0].choices[0].message.content
21 |     print(f'lora2-response: {response}')
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     engine = InferClient(host='127.0.0.1', port=8000)
26 |     infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
27 |     infer_multilora(engine, infer_request)
28 | 


--------------------------------------------------------------------------------
/examples/deploy/lora/server.sh:
--------------------------------------------------------------------------------
1 | # Since `swift/test_lora` is trained by swift and contains an `args.json` file,
2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
3 | CUDA_VISIBLE_DEVICES=0 swift deploy \
4 |     --host 0.0.0.0 \
5 |     --port 8000 \
6 |     --adapters lora1=swift/test_lora lora2=swift/test_lora2 \
7 |     --infer_backend vllm
8 | 


--------------------------------------------------------------------------------
/examples/deploy/reward_model/client.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from swift.llm import InferClient, InferRequest
 3 | 
 4 | if __name__ == '__main__':
 5 |     engine = InferClient(host='127.0.0.1', port=8000)
 6 |     models = engine.models
 7 |     print(f'models: {models}')
 8 |     messages = [{
 9 |         'role': 'user',
10 |         'content': "Hello! What's your name?"
11 |     }, {
12 |         'role': 'assistant',
13 |         'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?'
14 |     }]
15 |     resp_list = engine.infer([InferRequest(messages=messages)])
16 |     print(f'messages: {messages}')
17 |     print(f'response: {resp_list[0].choices[0].message.content}')
18 | 


--------------------------------------------------------------------------------
/examples/deploy/reward_model/server.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 swift deploy \
2 |     --host 0.0.0.0 \
3 |     --port 8000 \
4 |     --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
5 |     --infer_backend pt
6 | 


--------------------------------------------------------------------------------
/examples/deploy/server/README.md:
--------------------------------------------------------------------------------
 1 | Please refer to the examples in [examples/infer](../../infer/) and change `swift infer` to `swift deploy` to start the service. (You need to additionally remove `--val_dataset`)
 2 | 
 3 | e.g.
 4 | ```shell
 5 | CUDA_VISIBLE_DEVICES=0 \
 6 | swift deploy \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --infer_backend vllm
 9 | ```
10 | 


--------------------------------------------------------------------------------
/examples/deploy/server/demo.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 swift deploy \
 2 |     --model Qwen/Qwen2.5-7B-Instruct \
 3 |     --infer_backend vllm \
 4 |     --served_model_name Qwen2.5-7B-Instruct
 5 | 
 6 | # After the server-side deployment above is successful, use the command below to perform a client call test.
 7 | 
 8 | # curl http://localhost:8000/v1/chat/completions \
 9 | # -H "Content-Type: application/json" \
10 | # -d '{
11 | # "model": "Qwen2.5-7B-Instruct",
12 | # "messages": [{"role": "user", "content": "What is your name?"}],
13 | # "temperature": 0
14 | # }'
15 | 


--------------------------------------------------------------------------------
/examples/eval/eval_url/demo.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | import os
 3 | 
 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 5 | 
 6 | if __name__ == '__main__':
 7 |     from swift.llm import EvalArguments, eval_main, run_deploy, DeployArguments
 8 |     # A runnable demo is provided here; it evaluates through `eval_url`.
 9 |     # In a real scenario, you can simply remove the `run_deploy` context and point `eval_url` at your own service.
10 |     print(EvalArguments.list_eval_dataset())
11 |     with run_deploy(
12 |             DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'),
13 |             return_url=True) as url:
14 |         eval_main(EvalArguments(model='Qwen2.5-1.5B-Instruct', eval_url=url, eval_dataset=['arc']))
15 | 


--------------------------------------------------------------------------------
/examples/eval/eval_url/eval.sh:
--------------------------------------------------------------------------------
1 | # You need to have a deployed model or api service first
2 | swift eval \
3 |     --model '<model_name>' \
4 |     --eval_backend OpenCompass \
5 |     --eval_url http://127.0.0.1:8000/v1 \
6 |     --eval_limit 100 \
7 |     --eval_dataset gsm8k
8 | 


--------------------------------------------------------------------------------
/examples/eval/llm/eval.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift eval \
3 |     --model Qwen/Qwen2.5-1.5B-Instruct \
4 |     --eval_backend OpenCompass \
5 |     --infer_backend vllm \
6 |     --eval_limit 100 \
7 |     --eval_dataset gsm8k
8 | 


--------------------------------------------------------------------------------
/examples/eval/train_eval/train.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |   --model "Qwen/Qwen2.5-0.5B-Instruct" \
 4 |   --train_type "lora" \
 5 |   --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \
 6 |   --torch_dtype "bfloat16" \
 7 |   --num_train_epochs "1" \
 8 |   --per_device_train_batch_size "1" \
 9 |   --learning_rate "1e-4" \
10 |   --lora_rank "8" \
11 |   --lora_alpha "32" \
12 |   --target_modules "all-linear" \
13 |   --gradient_accumulation_steps "16" \
14 |   --save_steps "50" \
15 |   --save_total_limit "5" \
16 |   --logging_steps "5" \
17 |   --max_length "2048" \
18 |   --eval_strategy "steps" \
19 |   --eval_steps "5" \
20 |   --per_device_eval_batch_size "5" \
21 |   --eval_use_evalscope \
22 |   --eval_datasets "gsm8k" \
23 |   --eval_datasets_args '{"gsm8k": {"few_shot_num": 0}}' \
24 |   --eval_limit "10"
25 | 


--------------------------------------------------------------------------------
/examples/eval/vlm/eval.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | MAX_PIXELS=1003520 \
3 | swift eval \
4 |   --model Qwen/Qwen2-VL-2B-Instruct \
5 |   --infer_backend vllm \
6 |   --eval_limit 100 \
7 |   --eval_dataset realWorldQA \
8 |   --eval_backend VLMEvalKit
9 | 


--------------------------------------------------------------------------------
/examples/export/merge_lora.sh:
--------------------------------------------------------------------------------
1 | # Since `output/vx-xxx/checkpoint-xxx` is trained by swift and contains an `args.json` file,
2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
3 | swift export \
4 |     --adapters output/vx-xxx/checkpoint-xxx \
5 |     --merge_lora true
6 | 


--------------------------------------------------------------------------------
/examples/export/ollama.sh:
--------------------------------------------------------------------------------
1 | swift export \
2 |     --model Qwen/Qwen2.5-1.5B-Instruct \
3 |     --to_ollama true \
4 |     --output_dir Qwen2.5-1.5B-Instruct-ollama
5 | 


--------------------------------------------------------------------------------
/examples/export/push_to_hub.sh:
--------------------------------------------------------------------------------
1 | swift export \
2 |     --adapters output/vx-xxx/checkpoint-xxx \
3 |     --push_to_hub true \
4 |     --hub_model_id '<model-id>' \
5 |     --hub_token '<sdk-token>' \
6 |     --use_hf false
7 | 


--------------------------------------------------------------------------------
/examples/export/quantize/awq.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift export \
 3 |     --model Qwen/Qwen2.5-72B-Instruct \
 4 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 5 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 6 |     --device_map cpu \
 7 |     --quant_n_samples 256 \
 8 |     --quant_batch_size 1 \
 9 |     --max_length 2048 \
10 |     --quant_method awq \
11 |     --quant_bits 4 \
12 |     --output_dir Qwen2.5-72B-Instruct-AWQ
13 | 


--------------------------------------------------------------------------------
/examples/export/quantize/bert/bnb.sh:
--------------------------------------------------------------------------------
 1 | # merge-lora
 2 | CUDA_VISIBLE_DEVICES=0 swift export \
 3 |     --adapters swift/test_bert \
 4 |     --output_dir output/swift_test_bert_merged \
 5 |     --merge_lora true
 6 | 
 7 | # bnb quantize
 8 | CUDA_VISIBLE_DEVICES=0 swift export \
 9 |     --model output/swift_test_bert_merged \
10 |     --output_dir output/swift_test_bert_bnb_int4 \
11 |     --quant_bits 4 \
12 |     --quant_method bnb
13 | 
14 | # infer
15 | CUDA_VISIBLE_DEVICES=0 swift infer \
16 |     --model output/swift_test_bert_bnb_int4
17 | 


--------------------------------------------------------------------------------
/examples/export/quantize/bert/gptq.sh:
--------------------------------------------------------------------------------
 1 | # merge-lora
 2 | CUDA_VISIBLE_DEVICES=0 swift export \
 3 |     --adapters swift/test_bert \
 4 |     --output_dir output/swift_test_bert_merged \
 5 |     --merge_lora true
 6 | 
 7 | # gptq quantize
 8 | CUDA_VISIBLE_DEVICES=0 swift export \
 9 |     --model output/swift_test_bert_merged \
10 |     --load_data_args true \
11 |     --output_dir output/swift_test_bert_gptq_int4 \
12 |     --quant_bits 4 \
13 |     --quant_method gptq \
14 |     --max_length 512
15 | 
16 | # infer
17 | CUDA_VISIBLE_DEVICES=0 swift infer \
18 |     --model output/swift_test_bert_gptq_int4
19 | 


--------------------------------------------------------------------------------
/examples/export/quantize/bnb.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift export \
3 |     --model Qwen/Qwen2.5-1.5B-Instruct \
4 |     --quant_method bnb \
5 |     --quant_bits 4 \
6 |     --bnb_4bit_quant_type nf4 \
7 |     --bnb_4bit_use_double_quant true \
8 |     --output_dir Qwen2.5-1.5B-Instruct-BNB-NF4
9 | 


--------------------------------------------------------------------------------
/examples/export/quantize/gptq.sh:
--------------------------------------------------------------------------------
 1 | # OMP_NUM_THREADS=14 is set deliberately; for details, see: https://github.com/AutoGPTQ/AutoGPTQ/issues/439
 2 | OMP_NUM_THREADS=14 \
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift export \
 5 |     --model Qwen/Qwen2.5-1.5B-Instruct \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 8 |     --quant_n_samples 256 \
 9 |     --quant_batch_size 1 \
10 |     --max_length 2048 \
11 |     --quant_method gptq \
12 |     --quant_bits 4 \
13 |     --output_dir Qwen2.5-1.5B-Instruct-GPTQ-Int4
14 | 


--------------------------------------------------------------------------------
/examples/export/quantize/mllm/awq.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | MAX_PIXELS=1003520 \
 3 | VIDEO_MAX_PIXELS=50176 \
 4 | FPS_MAX_FRAMES=12 \
 5 | swift export \
 6 |     --model Qwen/Qwen2.5-VL-3B-Instruct \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'modelscope/coco_2014_caption:validation#500' \
10 |               'swift/VideoChatGPT:Generic#500' \
11 |     --quant_n_samples 256 \
12 |     --quant_batch_size -1 \
13 |     --max_length 2048 \
14 |     --quant_method awq \
15 |     --quant_bits 4 \
16 |     --output_dir Qwen2.5-VL-3B-Instruct-AWQ
17 | 


--------------------------------------------------------------------------------
/examples/export/quantize/mllm/gptq.sh:
--------------------------------------------------------------------------------
 1 | # OMP_NUM_THREADS=14 is set deliberately; for details, see: https://github.com/AutoGPTQ/AutoGPTQ/issues/439
 2 | OMP_NUM_THREADS=14 \
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | MAX_PIXELS=1003520 \
 5 | VIDEO_MAX_PIXELS=50176 \
 6 | FPS_MAX_FRAMES=12 \
 7 | swift export \
 8 |     --model Qwen/Qwen2.5-VL-3B-Instruct \
 9 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
10 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
11 |               'modelscope/coco_2014_caption:validation#500' \
12 |               'swift/VideoChatGPT:Generic#500' \
13 |     --quant_n_samples 256 \
14 |     --quant_batch_size 1 \
15 |     --max_length 2048 \
16 |     --quant_method gptq \
17 |     --quant_bits 4 \
18 |     --output_dir Qwen2.5-VL-3B-Instruct-GPTQ-Int4
19 | 


--------------------------------------------------------------------------------
/examples/export/quantize/moe/awq.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1 \
 2 | swift export \
 3 |     --model Qwen/Qwen3-30B-A3B \
 4 |     --dataset 'swift/Qwen3-SFT-Mixin' \
 5 |     --device_map auto \
 6 |     --quant_n_samples 64 \
 7 |     --quant_batch_size -1 \
 8 |     --max_length 8192 \
 9 |     --quant_method awq \
10 |     --quant_bits 4 \
11 |     --output_dir Qwen3-30B-A3B-AWQ
12 | 


--------------------------------------------------------------------------------
/examples/export/quantize/moe/gptq.sh:
--------------------------------------------------------------------------------
 1 | # 2 * 80GB
 2 | OMP_NUM_THREADS=14 \
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | swift export \
 5 |     --model Qwen/Qwen2-57B-A14B-Instruct \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#1000' \
 8 |     --quant_n_samples 512 \
 9 |     --quant_batch_size 1 \
10 |     --max_length 4096 \
11 |     --quant_method gptq \
12 |     --quant_bits 4 \
13 |     --output_dir Qwen2-57B-A14B-Instruct-GPTQ-Int4
14 | 


--------------------------------------------------------------------------------
/examples/export/quantize/omni/gptq.sh:
--------------------------------------------------------------------------------
 1 | # OMP_NUM_THREADS=14 is set deliberately; for details, see: https://github.com/AutoGPTQ/AutoGPTQ/issues/439
 2 | OMP_NUM_THREADS=14 \
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | MAX_PIXELS=1003520 \
 5 | VIDEO_MAX_PIXELS=50176 \
 6 | FPS_MAX_FRAMES=12 \
 7 | swift export \
 8 |     --model Qwen/Qwen2.5-Omni-7B \
 9 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
10 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
11 |               'modelscope/coco_2014_caption:validation#500' \
12 |               'swift/VideoChatGPT:Generic#500' \
13 |     --quant_n_samples 256 \
14 |     --quant_batch_size 1 \
15 |     --max_length 2048 \
16 |     --quant_method gptq \
17 |     --quant_bits 4 \
18 |     --output_dir Qwen2.5-Omni-7B-GPTQ-Int4
19 | 


--------------------------------------------------------------------------------
/examples/export/quantize/reward_model/bnb.sh:
--------------------------------------------------------------------------------
 1 | # bnb quantize
 2 | CUDA_VISIBLE_DEVICES=0 swift export \
 3 |     --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
 4 |     --output_dir output/internlm2-1_8b-reward-bnb-int4 \
 5 |     --quant_bits 4 \
 6 |     --quant_method bnb
 7 | 
 8 | # infer
 9 | CUDA_VISIBLE_DEVICES=0 swift infer \
10 |     --model output/internlm2-1_8b-reward-bnb-int4 \
11 |     --val_dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
12 |     --max_batch_size 16
13 | 


--------------------------------------------------------------------------------
/examples/export/quantize/reward_model/gptq.sh:
--------------------------------------------------------------------------------
 1 | # gptq quantize
 2 | CUDA_VISIBLE_DEVICES=0 swift export \
 3 |     --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
 4 |     --output_dir output/internlm2-1_8b-reward-gptq-int4 \
 5 |     --quant_bits 4 \
 6 |     --quant_method gptq \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' 'AI-ModelScope/alpaca-gpt4-data-en#1000'
 8 | 
 9 | # infer
10 | CUDA_VISIBLE_DEVICES=0 swift infer \
11 |     --model output/internlm2-1_8b-reward-gptq-int4 \
12 |     --val_dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
13 |     --max_batch_size 16
14 | 


--------------------------------------------------------------------------------
/examples/infer/cli_demo.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --model Qwen/Qwen2.5-1.5B-Instruct \
4 |     --infer_backend pt \
5 |     --stream true \
6 |     --max_new_tokens 2048
7 | 


--------------------------------------------------------------------------------
/examples/infer/lmdeploy/mllm_tp.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0,1 \
2 | swift infer \
3 |     --model OpenGVLab/InternVL2_5-1B \
4 |     --infer_backend lmdeploy \
5 |     --val_dataset AI-ModelScope/captcha-images#1000 \
6 |     --tp 2 \
7 |     --vision_batch_size 8 \
8 |     --max_new_tokens 2048
9 | 


--------------------------------------------------------------------------------
/examples/infer/pt/batch_ddp.sh:
--------------------------------------------------------------------------------
 1 | # 18GB
 2 | NPROC_PER_NODE=4 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 4 | swift infer \
 5 |     --model Qwen/Qwen2.5-1.5B-Instruct \
 6 |     --infer_backend pt \
 7 |     --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \
 8 |     --max_batch_size 16 \
 9 |     --max_new_tokens 512
10 | 


--------------------------------------------------------------------------------
/examples/infer/pt/bert.sh:
--------------------------------------------------------------------------------
1 | # Since `swift/test_bert` is trained by swift and contains an `args.json` file,
2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
3 | # To disable this behavior, please set `--load_args false`.
4 | CUDA_VISIBLE_DEVICES=0 \
5 | swift infer \
6 |     --adapters swift/test_bert \
7 |     --truncation_strategy right \
8 |     --max_length 512
9 | 


--------------------------------------------------------------------------------
/examples/infer/pt/lora.sh:
--------------------------------------------------------------------------------
 1 | # Since `swift/test_lora` is trained by swift and contains an `args.json` file,
 2 | # there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
 3 | # To disable this behavior, please set `--load_args false`.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift infer \
 6 |     --adapters swift/test_lora \
 7 |     --infer_backend pt \
 8 |     --stream true \
 9 |     --temperature 0 \
10 |     --max_new_tokens 2048
11 | 


--------------------------------------------------------------------------------
/examples/infer/pt/mllm_device_map.sh:
--------------------------------------------------------------------------------
 1 | NPROC_PER_NODE=2 \
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 3 | MAX_PIXELS=1003520 \
 4 | swift infer \
 5 |     --model Qwen/Qwen2.5-VL-3B-Instruct \
 6 |     --infer_backend pt \
 7 |     --val_dataset AI-ModelScope/LaTeX_OCR#1000 \
 8 |     --max_batch_size 16 \
 9 |     --max_new_tokens 512
10 | 


--------------------------------------------------------------------------------
/examples/infer/pt/prm.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --model Qwen/Qwen2.5-Math-PRM-7B \
4 |     --infer_backend pt
5 | 


--------------------------------------------------------------------------------
/examples/infer/pt/reward_model.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
4 |     --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \
5 |     --max_batch_size 64
6 | 


--------------------------------------------------------------------------------
/examples/infer/vllm/dp_tp.sh:
--------------------------------------------------------------------------------
 1 | NPROC_PER_NODE=4 \
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 3 | swift infer \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --infer_backend vllm \
 6 |     --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#2000 \
 7 |     --gpu_memory_utilization 0.9 \
 8 |     --max_model_len 8192 \
 9 |     --tensor_parallel_size 2 \
10 |     --max_new_tokens 2048 \
11 |     --write_batch_size 1000
12 | 


--------------------------------------------------------------------------------
/examples/infer/vllm/mllm_ddp.sh:
--------------------------------------------------------------------------------
 1 | # You need to use flash-attn (manual installation) instead of xformers.
 2 | NPROC_PER_NODE=2 \
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | swift infer \
 5 |     --model Qwen/Qwen2-Audio-7B-Instruct \
 6 |     --infer_backend vllm \
 7 |     --val_dataset speech_asr/speech_asr_aishell1_trainsets:validation#1000 \
 8 |     --gpu_memory_utilization 0.9 \
 9 |     --max_model_len 8192 \
10 |     --max_new_tokens 2048 \
11 |     --limit_mm_per_prompt '{"audio": 5}'
12 | 


--------------------------------------------------------------------------------
/examples/infer/vllm/mllm_tp.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1 \
 2 | MAX_PIXELS=1003520 \
 3 | swift infer \
 4 |     --model Qwen/Qwen2.5-VL-3B-Instruct \
 5 |     --infer_backend vllm \
 6 |     --val_dataset AI-ModelScope/LaTeX_OCR#1000 \
 7 |     --gpu_memory_utilization 0.9 \
 8 |     --tensor_parallel_size 2 \
 9 |     --max_model_len 32768 \
10 |     --max_new_tokens 2048 \
11 |     --limit_mm_per_prompt '{"image": 5, "video": 2}'
12 | 


--------------------------------------------------------------------------------
/examples/notebook/qwen2_5-self-cognition/infer.sh:
--------------------------------------------------------------------------------
1 | # Here is the command-line style inference code.
2 | CUDA_VISIBLE_DEVICES=0 \
3 | swift infer \
4 |     --adapters output/vx-xxx/checkpoint-xxx \
5 |     --stream true \
6 |     --temperature 0 \
7 |     --max_new_tokens 2048
8 | 


--------------------------------------------------------------------------------
/examples/notebook/qwen2_5-self-cognition/sft.sh:
--------------------------------------------------------------------------------
 1 | # Here is the command-line style training code.
 2 | # 22GB
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-3B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'swift/self-cognition#500' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 50 \
20 |     --save_steps 50 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --dataset_num_proc 4 \
29 |     --model_name 小黄 'Xiao Huang' \
30 |     --model_author '魔搭' 'ModelScope'
31 | 


--------------------------------------------------------------------------------
/examples/sampler/distill/distill.sh:
--------------------------------------------------------------------------------
 1 | OPENAI_API_KEY="xxx" \
 2 | swift sample \
 3 |     --sampler_type distill \
 4 |     --sampler_engine client \
 5 |     --model deepseek-r1 \
 6 |     --stream true \
 7 |     --dataset tastelikefeet/competition_math#5 \
 8 |     --num_return_sequences 1 \
 9 |     --temperature 0.6 \
10 |     --top_p 0.95 \
11 |     --engine_kwargs '{"base_url":"https://dashscope.aliyuncs.com/compatible-mode/v1"}'
12 | 


--------------------------------------------------------------------------------
/examples/sampler/mcts/system_prompt.txt:
--------------------------------------------------------------------------------
1 | You are a math model, you should **think step by step** carefully. Each step should **end with \"ки\”**. Final answer should be in a ‘\boxed()’.
2 | 
3 | ## Example:
4 | Step1: XXX. ки\n
5 | Step2: XXX. ки\n
6 | Step3: XXX. ки\n
7 | Answer: \boxed(answer). ки\n
8 | 


--------------------------------------------------------------------------------
/examples/train/agent/deepseek_r1.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
 4 |     --train_type full \
 5 |     --dataset AI-ModelScope/function-calling-chatml \
 6 |     --agent_template react_en \
 7 |     --loss_scale react \
 8 |     --response_prefix '' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 2 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 8 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --save_only_model true \
21 |     --packing true \
22 |     --use_liger_kernel true \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --attn_impl flash_attn \
26 |     --dataloader_num_workers 4 \
27 |     --dataset_num_proc 16
28 | 


--------------------------------------------------------------------------------
/examples/train/agent/glm4.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 80GiB
 2 | NPROC_PER_NODE=4 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 4 | swift sft \
 5 |     --model ZhipuAI/GLM-4-9B-0414 \
 6 |     --train_type full \
 7 |     --dataset AI-ModelScope/function-calling-chatml \
 8 |     --agent_template hermes \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 2 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 2 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --save_only_model true \
21 |     --packing true \
22 |     --deepspeed zero3 \
23 |     --use_liger_kernel true \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --attn_impl flash_attn \
27 |     --dataloader_num_workers 4 \
28 |     --dataset_num_proc 16
29 | 


--------------------------------------------------------------------------------
/examples/train/agent/loss_scale/train.sh:
--------------------------------------------------------------------------------
 1 | # 20GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-3B \
 5 |     --train_type lora \
 6 |     --dataset AI-ModelScope/function-calling-chatml#10000 \
 7 |     --loss_scale hermes \
 8 |     --agent_template hermes \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 2 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --modules_to_save embed_tokens lm_head \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --use_liger_kernel true \
25 |     --output_dir output \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --dataset_num_proc 16
29 | 


--------------------------------------------------------------------------------
/examples/train/agent/qwen2_5.sh:
--------------------------------------------------------------------------------
 1 | # 35GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-3B \
 5 |     --train_type full \
 6 |     --dataset AI-ModelScope/function-calling-chatml \
 7 |     --agent_template hermes \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 2 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-5 \
13 |     --gradient_accumulation_steps 8 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --max_length 8192 \
19 |     --save_only_model true \
20 |     --packing true \
21 |     --use_liger_kernel true \
22 |     --output_dir output \
23 |     --warmup_ratio 0.05 \
24 |     --attn_impl flash_attn \
25 |     --dataloader_num_workers 4 \
26 |     --dataset_num_proc 16
27 | 


--------------------------------------------------------------------------------
/examples/train/all_to_all/infer.sh:
--------------------------------------------------------------------------------
 1 | # 53GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift infer \
 4 |     --model BAAI/Emu3-Gen \
 5 |     --infer_backend pt \
 6 |     --stream False \
 7 |     --use_chat_template False \
 8 |     --top_k 2048 \
 9 |     --max_new_tokens 40960
10 | 


--------------------------------------------------------------------------------
/examples/train/all_to_all/train.sh:
--------------------------------------------------------------------------------
 1 | # 70 GiB * 2
 2 | nproc_per_node=2
 3 | NPROC_PER_NODE=$nproc_per_node \
 4 | CUDA_VISIBLE_DEVICES=0,2 \
 5 | max_position_embeddings=10240 \
 6 | image_area=518400 \
 7 | swift sft \
 8 |     --model BAAI/Emu3-Gen \
 9 |     --train_type lora \
10 |     --dataset 'swift/TextCaps#40' \
11 |     --torch_dtype bfloat16 \
12 |     --num_train_epochs 10 \
13 |     --per_device_train_batch_size 1 \
14 |     --learning_rate 1e-5 \
15 |     --gradient_accumulation_steps 4 \
16 |     --warmup_ratio 0.03 \
17 |     --eval_steps 500 \
18 |     --save_steps 500 \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --max_length 1024 \
22 |     --weight_decay 0.1 \
23 |     --gradient_checkpointing_kwargs '{"use_reentrant": false}'
24 | 


--------------------------------------------------------------------------------
/examples/train/base_to_chat/full.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=2
 2 | 
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | NPROC_PER_NODE=$nproc_per_node \
 5 | swift sft \
 6 |     --model Qwen/Qwen2.5-1.5B \
 7 |     --train_type full \
 8 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 9 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
10 |               'swift/self-cognition' \
11 |     --torch_dtype bfloat16 \
12 |     --num_train_epochs 10 \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --learning_rate 1e-5 \
16 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
17 |     --eval_steps 200 \
18 |     --save_steps 200 \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --max_length 2048 \
22 |     --output_dir output \
23 |     --system 'You are a helpful assistant.' \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4 \
26 |     --model_author swift \
27 |     --model_name swift-robot \
28 |     --deepspeed zero2
29 | 


--------------------------------------------------------------------------------
/examples/train/base_to_chat/lora.sh:
--------------------------------------------------------------------------------
 1 | # Use `--template default`
 2 | nproc_per_node=2
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1 \
 5 | MASTER_PORT=29501 \
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | swift sft \
 8 |     --model Qwen/Qwen2.5-1.5B \
 9 |     --train_type lora \
10 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
11 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
12 |               'swift/self-cognition' \
13 |     --torch_dtype bfloat16 \
14 |     --template default \
15 |     --num_train_epochs 10 \
16 |     --per_device_train_batch_size 1 \
17 |     --per_device_eval_batch_size 1 \
18 |     --learning_rate 1e-4 \
19 |     --lora_rank 8 \
20 |     --lora_alpha 32 \
21 |     --target_modules all-linear \
22 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
23 |     --eval_steps 50 \
24 |     --save_steps 50 \
25 |     --save_total_limit 2 \
26 |     --logging_steps 5 \
27 |     --max_length 2048 \
28 |     --output_dir output \
29 |     --system 'You are a helpful assistant.' \
30 |     --warmup_ratio 0.05 \
31 |     --dataloader_num_workers 4 \
32 |     --model_author swift \
33 |     --model_name swift-robot \
34 |     --deepspeed zero2
35 | 


--------------------------------------------------------------------------------
/examples/train/embedding/train_gme.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=8
 2 | 
 3 | # losses: plugin/loss.py
 4 | # 8*40G
 5 | MAX_PIXELS=1003520 \
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | swift sft \
 8 |     --model iic/gme-Qwen2-VL-2B-Instruct \
 9 |     --train_type lora \
10 |     --dataset 'swift/TextCaps:emb' \
11 |     --torch_dtype bfloat16 \
12 |     --num_train_epochs 1 \
13 |     --per_device_train_batch_size 2 \
14 |     --per_device_eval_batch_size 2 \
15 |     --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \
16 |     --eval_steps 100 \
17 |     --save_steps 100 \
18 |     --eval_strategy steps \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --output_dir output \
22 |     --lazy_tokenize true \
23 |     --warmup_ratio 0.05 \
24 |     --learning_rate 5e-6 \
25 |     --deepspeed zero3 \
26 |     --dataloader_num_workers 4 \
27 |     --task_type embedding \
28 |     --loss_type infonce \
29 |     --dataloader_drop_last true
30 | 


--------------------------------------------------------------------------------
/examples/train/embedding/train_gte.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=8
 2 | # 4*12G
 3 | # losses: plugin/loss.py
 4 | # data format: docs/source_en/Customization/Custom-dataset.md
 5 | # --use_chat_template must be false to use generation template
 6 | # --dataloader_drop_last must be true or eval gather will throw error
 7 | # --model iic/gte-modernbert-base modernbert also supported
 8 | NPROC_PER_NODE=$nproc_per_node \
 9 | swift sft \
10 |     --model iic/gte_Qwen2-7B-instruct \
11 |     --train_type lora \
12 |     --dataset 'sentence-transformers/stsb' \
13 |     --torch_dtype bfloat16 \
14 |     --num_train_epochs 1 \
15 |     --per_device_train_batch_size 2 \
16 |     --per_device_eval_batch_size 1 \
17 |     --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --eval_strategy steps \
21 |     --use_chat_template false \
22 |     --save_total_limit 2 \
23 |     --logging_steps 5 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --learning_rate 5e-6 \
27 |     --deepspeed zero3 \
28 |     --dataloader_num_workers 4 \
29 |     --task_type embedding \
30 |     --loss_type cosine_similarity \
31 |     --dataloader_drop_last true \
32 | 


--------------------------------------------------------------------------------
/examples/train/full/infer.sh:
--------------------------------------------------------------------------------
1 | # If you are using the validation set for inference, add the parameter `--load_data_args true`.
2 | CUDA_VISIBLE_DEVICES=0 \
3 | swift infer \
4 |     --model output/vx-xxx/checkpoint-xxx \
5 |     --stream true \
6 |     --temperature 0 \
7 |     --max_new_tokens 2048
8 | 


--------------------------------------------------------------------------------
/examples/train/full/qwen2_5_32b.sh:
--------------------------------------------------------------------------------
 1 | # 8 * 80GiB
 2 | NPROC_PER_NODE=8 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-32B \
 6 |     --train_type full \
 7 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
 8 |     --torch_dtype bfloat16 \
 9 |     --max_steps 2000 \
10 |     --streaming true \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 2 \
15 |     --packing true \
16 |     --eval_steps 200 \
17 |     --save_steps 200 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --warmup_ratio 0.05 \
21 |     --dataloader_num_workers 8 \
22 |     --dataset_num_proc 8 \
23 |     --save_total_limit 2 \
24 |     --save_only_model true \
25 |     --output_dir output/Qwen2.5-32B \
26 |     --deepspeed zero3 \
27 |     --use_liger_kernel true \
28 |     --attn_impl flash_attn
29 | 


--------------------------------------------------------------------------------
/examples/train/full/train.sh:
--------------------------------------------------------------------------------
 1 | # 76GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type full \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 8 |               'swift/self-cognition#500' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 16 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --max_length 2048 \
20 |     --output_dir output \
21 |     --system 'You are a helpful assistant.' \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4 \
24 |     --model_author swift \
25 |     --model_name swift-robot
26 | 


--------------------------------------------------------------------------------
/examples/train/grpo/internal/README.md:
--------------------------------------------------------------------------------
 1 | # README: GRPO Internal (Colocate) Mode Execution Scripts
 2 | 
 3 | ---
 4 | **NOTE**
 5 | The scripts in this folder require the source code version of ms-swift.
 6 | 
 7 | ```
 8 | git clone https://github.com/modelscope/ms-swift.git
 9 | cd ms-swift
10 | pip install -e .
11 | ```
12 | 
13 | ## **Introduction**
14 | 
15 | The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows you to deploy vLLM and perform training using the same GPU resources.
16 | 
17 | This folder contains scripts and instructions for running GRPO in **Internal Mode**.
18 | 
19 | ## Training with Internal Mode
20 | ```bash
21 | --use_vllm true \
22 | --vllm_mode colocate \
23 | --vllm_gpu_memory_utilization [ut_ratio] \
24 | ```
25 | 
26 | ## Multi-Node Training
27 | On each node, execute the original single-node training script, using the environment variables `NNODES` and `NODE_RANK`, and ensure consistent use of configuration parameters across all nodes.
28 | 


--------------------------------------------------------------------------------
/examples/train/grpo/internal/full_lmdeploy.sh:
--------------------------------------------------------------------------------
1 | # The LMDeploy backend in GRPO has been deprecated in Swift 3.5.
2 | # You can install Swift 3.4 to continue using it with the following script:
3 | # https://github.com/modelscope/ms-swift/blob/v3.4.1/examples/train/grpo/internal/full_lmdeploy.sh
4 | 


--------------------------------------------------------------------------------
/examples/train/grpo/internal/vllm_vl7b.sh:
--------------------------------------------------------------------------------
 1 | MAX_PIXELS=1003520 \
 2 | NPROC_PER_NODE=8 \
 3 | swift rlhf \
 4 |     --rlhf_type grpo \
 5 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset AI-ModelScope/chartqa_digit_r1v_format \
 8 |     --use_vllm true \
 9 |     --vllm_mode colocate \
10 |     --vllm_gpu_memory_utilization 0.5 \
11 |     --vllm_tensor_parallel_size 4 \
12 |     --torch_dtype bfloat16 \
13 |     --system examples/train/grpo/prompt.txt \
14 |     --num_train_epochs 1 \
15 |     --per_device_train_batch_size 1 \
16 |     --per_device_eval_batch_size 1 \
17 |     --learning_rate 1e-6 \
18 |     --save_total_limit 2 \
19 |     --logging_steps 5 \
20 |     --output_dir output \
21 |     --gradient_accumulation_steps 1 \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4 \
24 |     --max_completion_length 1024 \
25 |     --reward_funcs accuracy format \
26 |     --num_generations 8 \
27 |     --sleep_level 1 \
28 |     --temperature 1.0 \
29 |     --top_p 0.85
30 | 


--------------------------------------------------------------------------------
/examples/train/grpo/plugin/run_external_reward_model.sh:
--------------------------------------------------------------------------------
 1 | # See the reward-model plugin examples in swift/plugin/rm_plugin.py.
 2 | # Register customized plugins in the file passed via --external_plugins.
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 5 | NPROC_PER_NODE=8 \
 6 | swift rlhf \
 7 |     --rlhf_type grpo \
 8 |     --model Qwen/Qwen2.5-7B \
 9 |     --dataset AI-MO/NuminaMath-TIR#5000 \
10 |     --use_vllm true \
11 |     --vllm_mode colocate \
12 |     --vllm_gpu_memory_utilization 0.5 \
13 |     --external_plugins examples/train/grpo/plugin/plugin.py \
14 |     --reward_funcs format \
15 |     --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \
16 |     --reward_model_plugin genrm my_rmplugin \
17 |     --reward_weights 0.1 1 1 \
18 |     --sleep_level 1 \
19 |     --offload_model true \
20 |     --offload_optimizer true \
21 |     --gc_collect_after_offload true \
22 |     --log_completions true \
23 |     --deepspeed zero2
24 | 


--------------------------------------------------------------------------------
/examples/train/grpo/prompt.txt:
--------------------------------------------------------------------------------
1 | A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>
2 | 


--------------------------------------------------------------------------------
/examples/train/grpo/qwen2_5_omni/infer.sh:
--------------------------------------------------------------------------------
1 | MAX_PIXELS=1003520 \
2 | CUDA_VISIBLE_DEVICES=0 \
3 | swift infer \
4 |     --adapters vx-xxx/checkpoint-xxx \
5 |     --load_data_args true \
6 |     --stream true \
7 |     --max_new_tokens 2048
8 | 


--------------------------------------------------------------------------------
/examples/train/infer.sh:
--------------------------------------------------------------------------------
1 | # If it's full parameter training, use `--model xxx` instead of `--adapters xxx`.
2 | # If you are using the validation set for inference, add the parameter `--load_data_args true`.
3 | CUDA_VISIBLE_DEVICES=0 \
4 | swift infer \
5 |     --adapters output/vx-xxx/checkpoint-xxx \
6 |     --stream true \
7 |     --temperature 0 \
8 |     --max_new_tokens 2048
9 | 


--------------------------------------------------------------------------------
/examples/train/lazy_tokenize/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 100 \
14 |     --save_steps 100 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --lazy_tokenize true \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/liger/sft.sh:
--------------------------------------------------------------------------------
 1 | # test env: 4 * A100
 2 | # Using use_liger_kernel and packing: 4 * 42GB, 1 hour 35 minutes
 3 | # Not using use_liger_kernel: 4 * 54GB, 1 hour 40 minutes
 4 | # Using neither use_liger_kernel nor packing: 4 * 52GB, 3 hours 30 minutes
 5 | 
 6 | NPROC_PER_NODE=4 \
 7 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 8 | swift sft \
 9 |     --model Qwen/Qwen2.5-7B \
10 |     --train_type full \
11 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT#10000' \
12 |     --torch_dtype bfloat16 \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --learning_rate 1e-5 \
16 |     --num_train_epochs 5 \
17 |     --gradient_accumulation_steps 2 \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --logging_steps 5 \
21 |     --max_length 8192 \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 8 \
24 |     --dataset_num_proc 8 \
25 |     --save_total_limit 2 \
26 |     --save_only_model true \
27 |     --output_dir output/Qwen2.5-7B \
28 |     --deepspeed zero3 \
29 |     --attn_impl flash_attn \
30 |     --packing true \
31 |     --use_liger_kernel true
32 | 


--------------------------------------------------------------------------------
/examples/train/long_text/liger_kernel.sh:
--------------------------------------------------------------------------------
 1 | # Env: 4 * A100
 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/long_text.sh
 3 | # Max Length: 16K
 4 | # GPU Memory: 4 * 42GB, Training Speed 10s/it
 5 | NPROC_PER_NODE=4 \
 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 7 | swift sft \
 8 |     --model Qwen/Qwen2.5-7B \
 9 |     --train_type full \
10 |     --dataset 'AI-ModelScope/LongAlpaca-12k' \
11 |     --torch_dtype bfloat16 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-5 \
15 |     --gradient_accumulation_steps 2 \
16 |     --packing true \
17 |     --eval_steps 200 \
18 |     --save_steps 200 \
19 |     --logging_steps 5 \
20 |     --max_length 16384 \
21 |     --warmup_ratio 0.05 \
22 |     --dataloader_num_workers 8 \
23 |     --dataset_num_proc 8 \
24 |     --save_total_limit 2 \
25 |     --save_only_model true \
26 |     --output_dir output/Qwen2.5-7B \
27 |     --deepspeed zero3 \
28 |     --use_liger_kernel true \
29 |     --attn_impl flash_attn
30 | 


--------------------------------------------------------------------------------
/examples/train/long_text/sequence_parallel.sh:
--------------------------------------------------------------------------------
 1 | # Env: 4 * A100
 2 | # Max Length: 65536
 3 | # GPU Memory: 4 * 53GiB, Training Speed 50s/it
 4 | NPROC_PER_NODE=4 \
 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type full \
 9 |     --dataset 'AI-ModelScope/LongAlpaca-12k' \
10 |     --torch_dtype bfloat16 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 8 \
15 |     --packing true \
16 |     --rope_scaling yarn \
17 |     --max_length 65536 \
18 |     --eval_steps 200 \
19 |     --save_steps 200 \
20 |     --logging_steps 5 \
21 |     --warmup_ratio 0.05 \
22 |     --dataloader_num_workers 8 \
23 |     --dataset_num_proc 8 \
24 |     --save_total_limit 2 \
25 |     --save_only_model true \
26 |     --output_dir output/Qwen2.5-7B-Instruct \
27 |     --deepspeed zero3 \
28 |     --attn_impl flash_attn \
29 |     --sequence_parallel_size 4
30 | 


--------------------------------------------------------------------------------
/examples/train/long_text/sequence_parallel_512k.sh:
--------------------------------------------------------------------------------
 1 | # Env: 8 * A100
 2 | # Max Length: 512000
 3 | # GPU Memory: 8 * 80GiB, Training Speed 150s/it
 4 | NPROC_PER_NODE=8 \
 5 | CELOSS_PARALLEL_SIZE=2048 \
 6 | swift sft \
 7 |     --model Qwen/QwQ-32B \
 8 |     --train_type lora \
 9 |     --dataset 'AI-ModelScope/LongAlpaca-12k' \
10 |     --torch_dtype bfloat16 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 2 \
15 |     --packing true \
16 |     --rope_scaling yarn \
17 |     --max_length 512000 \
18 |     --eval_steps 200 \
19 |     --save_steps 200 \
20 |     --logging_steps 5 \
21 |     --warmup_ratio 0.05 \
22 |     --dataloader_num_workers 8 \
23 |     --dataset_num_proc 8 \
24 |     --save_total_limit 2 \
25 |     --use_liger_kernel true \
26 |     --save_only_model true \
27 |     --deepspeed zero3_offload \
28 |     --attn_impl flash_attn \
29 |     --sequence_parallel_size 8
30 | 


--------------------------------------------------------------------------------
/examples/train/long_text/sequence_parallel_dpo.sh:
--------------------------------------------------------------------------------
 1 | # Env: 4 * A100
 2 | # GPU Memory: 4 * 25GiB, Training Speed 14s/it
 3 | NPROC_PER_NODE=4 \
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | swift rlhf \
 6 |     --rlhf_type dpo \
 7 |     --model Qwen/Qwen2.5-VL-3B-Instruct \
 8 |     --train_type full \
 9 |     --dataset swift/RLAIF-V-Dataset \
10 |     --torch_dtype bfloat16 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 8 \
15 |     --eval_steps 200 \
16 |     --save_steps 200 \
17 |     --logging_steps 5 \
18 |     --warmup_ratio 0.05 \
19 |     --dataloader_num_workers 8 \
20 |     --dataset_num_proc 8 \
21 |     --save_total_limit 2 \
22 |     --save_only_model true \
23 |     --output_dir output/Qwen2.5-VL-3B-Instruct \
24 |     --deepspeed zero3 \
25 |     --attn_impl flash_attn \
26 |     --use_liger_kernel true \
27 |     --sequence_parallel_size 4
28 | 


--------------------------------------------------------------------------------
/examples/train/lora_sft.sh:
--------------------------------------------------------------------------------
 1 | # 22GB
 2 | # qwen3: https://github.com/modelscope/ms-swift/blob/main/examples/train/think_model/qwen3_demo1.sh
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'swift/self-cognition#500' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 50 \
20 |     --save_steps 50 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot
30 | 


--------------------------------------------------------------------------------
/examples/train/megatron/base_to_chat.sh:
--------------------------------------------------------------------------------
 1 | # 8 * 65GiB
 2 | NPROC_PER_NODE=8 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 4 | megatron sft \
 5 |     --load Qwen2.5-14B-mcore \
 6 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
 7 |     --tensor_model_parallel_size 4 \
 8 |     --micro_batch_size 1 \
 9 |     --global_batch_size 16 \
10 |     --packing true \
11 |     --recompute_granularity selective \
12 |     --train_iters 2000 \
13 |     --eval_iters 50 \
14 |     --finetune true \
15 |     --cross_entropy_loss_fusion true \
16 |     --lr 1e-5 \
17 |     --lr_warmup_iters 100 \
18 |     --min_lr 1e-6 \
19 |     --save megatron_output/Qwen2.5-14B \
20 |     --eval_interval 200 \
21 |     --save_interval 200 \
22 |     --max_length 8192 \
23 |     --num_workers 8 \
24 |     --dataset_num_proc 8 \
25 |     --no_save_optim true \
26 |     --no_save_rng true \
27 |     --sequence_parallel true \
28 |     --use_flash_attn true
29 | 


--------------------------------------------------------------------------------
/examples/train/megatron/benchmark/deepspeed.sh:
--------------------------------------------------------------------------------
 1 | # 8 * 80GiB
 2 | # Corresponding Megatron-SWIFT script reference:
 3 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/base_to_chat.sh
 4 | NPROC_PER_NODE=8 \
 5 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-14B \
 8 |     --train_type full \
 9 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
10 |     --torch_dtype bfloat16 \
11 |     --max_steps 2000 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-5 \
15 |     --gradient_accumulation_steps 2 \
16 |     --packing true \
17 |     --eval_steps 200 \
18 |     --save_steps 200 \
19 |     --logging_steps 5 \
20 |     --max_length 8192 \
21 |     --warmup_ratio 0.05 \
22 |     --dataloader_num_workers 8 \
23 |     --dataset_num_proc 8 \
24 |     --save_total_limit -1 \
25 |     --save_only_model true \
26 |     --output_dir output/Qwen2.5-14B \
27 |     --deepspeed zero2 \
28 |     --attn_impl flash_attn
29 | 


--------------------------------------------------------------------------------
/examples/train/megatron/long_text.sh:
--------------------------------------------------------------------------------
 1 | # Env: 4 * A100
 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/long_text/liger_kernel.sh
 3 | # Max Length: 32K
 4 | # GPU Memory: 4 * 50GB, Training Speed 23s/it
 5 | NPROC_PER_NODE=4 \
 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 7 | megatron sft \
 8 |     --load Qwen2.5-7B-mcore \
 9 |     --dataset 'ZhipuAI/LongWriter-6k' \
10 |     --tensor_model_parallel_size 4 \
11 |     --micro_batch_size 1 \
12 |     --global_batch_size 8 \
13 |     --packing true \
14 |     --recompute_granularity full \
15 |     --recompute_method uniform \
16 |     --recompute_num_layers 1 \
17 |     --train_iters 1000 \
18 |     --eval_iters 50 \
19 |     --finetune true \
20 |     --cross_entropy_loss_fusion true \
21 |     --lr 1e-5 \
22 |     --lr_warmup_iters 100 \
23 |     --min_lr 1e-6 \
24 |     --save megatron_output/Qwen2.5-7B \
25 |     --eval_interval 200 \
26 |     --save_interval 200 \
27 |     --max_length 32768 \
28 |     --num_workers 8 \
29 |     --dataset_num_proc 8 \
30 |     --no_save_optim true \
31 |     --no_save_rng true \
32 |     --sequence_parallel true \
33 |     --use_flash_attn true
34 | 


--------------------------------------------------------------------------------
/examples/train/megatron/moe.sh:
--------------------------------------------------------------------------------
 1 | # 8 * 65GiB
 2 | NPROC_PER_NODE=8 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 4 | megatron sft \
 5 |     --load Qwen1.5-MoE-A2.7B-mcore \
 6 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
 7 |     --tensor_model_parallel_size 2 \
 8 |     --expert_model_parallel_size 4 \
 9 |     --moe_grouped_gemm true \
10 |     --moe_shared_expert_overlap true \
11 |     --moe_aux_loss_coeff 0.01 \
12 |     --micro_batch_size 1 \
13 |     --global_batch_size 16 \
14 |     --packing true \
15 |     --recompute_granularity selective \
16 |     --train_iters 2000 \
17 |     --eval_iters 50 \
18 |     --finetune true \
19 |     --cross_entropy_loss_fusion true \
20 |     --lr 1e-5 \
21 |     --lr_warmup_iters 100 \
22 |     --min_lr 1e-6 \
23 |     --save megatron_output/Qwen1.5-MoE-A2.7B \
24 |     --eval_interval 200 \
25 |     --save_interval 200 \
26 |     --max_length 8192 \
27 |     --num_workers 8 \
28 |     --dataset_num_proc 8 \
29 |     --no_save_optim true \
30 |     --no_save_rng true \
31 |     --sequence_parallel true \
32 |     --use_flash_attn true
33 | 


--------------------------------------------------------------------------------
/examples/train/megatron/multi-node/node1.sh:
--------------------------------------------------------------------------------
 1 | # For more information on multi-node training launch methods, refer to:
 2 | # https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | NNODES=2 \
 6 | NODE_RANK=0 \
 7 | MASTER_ADDR=127.0.0.1 \
 8 | MASTER_PORT=29500 \
 9 | NPROC_PER_NODE=4 \
10 | megatron sft \
11 |     --load Qwen2.5-14B-mcore \
12 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
13 |     --tensor_model_parallel_size 4 \
14 |     --micro_batch_size 1 \
15 |     --global_batch_size 16 \
16 |     --packing true \
17 |     --recompute_granularity selective \
18 |     --train_iters 2000 \
19 |     --eval_iters 50 \
20 |     --finetune true \
21 |     --cross_entropy_loss_fusion true \
22 |     --lr 1e-5 \
23 |     --lr_warmup_iters 100 \
24 |     --min_lr 1e-6 \
25 |     --save megatron_output/Qwen2.5-14B \
26 |     --eval_interval 200 \
27 |     --save_interval 200 \
28 |     --max_length 8192 \
29 |     --num_workers 8 \
30 |     --dataset_num_proc 8 \
31 |     --no_save_optim true \
32 |     --no_save_rng true \
33 |     --sequence_parallel true \
34 |     --use_flash_attn true
35 | 


--------------------------------------------------------------------------------
/examples/train/megatron/multi-node/node2.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 2 | NNODES=2 \
 3 | NODE_RANK=1 \
 4 | MASTER_ADDR=xxx.xxx.xxx.xxx \
 5 | MASTER_PORT=29500 \
 6 | NPROC_PER_NODE=4 \
 7 | megatron sft \
 8 |     --load Qwen2.5-14B-mcore \
 9 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
10 |     --tensor_model_parallel_size 4 \
11 |     --micro_batch_size 1 \
12 |     --global_batch_size 16 \
13 |     --packing true \
14 |     --recompute_granularity selective \
15 |     --train_iters 2000 \
16 |     --eval_iters 50 \
17 |     --finetune true \
18 |     --cross_entropy_loss_fusion true \
19 |     --lr 1e-5 \
20 |     --lr_warmup_iters 100 \
21 |     --min_lr 1e-6 \
22 |     --save megatron_output/Qwen2.5-14B \
23 |     --eval_interval 200 \
24 |     --save_interval 200 \
25 |     --max_length 8192 \
26 |     --num_workers 8 \
27 |     --dataset_num_proc 8 \
28 |     --no_save_optim true \
29 |     --no_save_rng true \
30 |     --sequence_parallel true \
31 |     --use_flash_attn true
32 | 


--------------------------------------------------------------------------------
/examples/train/megatron/pretrain.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 80GiB
 2 | NPROC_PER_NODE=4 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 4 | megatron pt \
 5 |     --load Qwen2.5-7B-mcore \
 6 |     --dataset swift/chinese-c4 \
 7 |     --streaming true \
 8 |     --packing true \
 9 |     --tensor_model_parallel_size 4 \
10 |     --micro_batch_size 1 \
11 |     --global_batch_size 16 \
12 |     --recompute_granularity selective \
13 |     --train_iters 10000 \
14 |     --eval_iters 100 \
15 |     --finetune true \
16 |     --cross_entropy_loss_fusion true \
17 |     --lr 1e-5 \
18 |     --lr_warmup_iters 300 \
19 |     --min_lr 1e-6 \
20 |     --save megatron_output/Qwen2.5-7B \
21 |     --eval_interval 500 \
22 |     --save_interval 500 \
23 |     --max_length 8192 \
24 |     --num_workers 4 \
25 |     --dataset_num_proc 8 \
26 |     --no_save_optim true \
27 |     --no_save_rng true \
28 |     --sequence_parallel true \
29 |     --use_flash_attn true
30 | 


--------------------------------------------------------------------------------
/examples/train/megatron/qwen3_32b.sh:
--------------------------------------------------------------------------------
 1 | # 8 * 80GiB
 2 | PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 3 | NPROC_PER_NODE=8 \
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 5 | megatron sft \
 6 |     --load Qwen3-32B-mcore \
 7 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
 8 |     --tensor_model_parallel_size 8 \
 9 |     --micro_batch_size 1 \
10 |     --global_batch_size 16 \
11 |     --packing true \
12 |     --recompute_granularity full \
13 |     --recompute_method uniform \
14 |     --recompute_num_layers 1 \
15 |     --train_iters 10000 \
16 |     --max_epochs 5 \
17 |     --eval_iters 50 \
18 |     --finetune true \
19 |     --cross_entropy_loss_fusion true \
20 |     --lr 1e-5 \
21 |     --lr_warmup_iters 100 \
22 |     --min_lr 1e-6 \
23 |     --save megatron_output/Qwen3-32B \
24 |     --eval_interval 500 \
25 |     --save_interval 500 \
26 |     --max_length 8192 \
27 |     --num_workers 8 \
28 |     --dataset_num_proc 8 \
29 |     --no_save_optim true \
30 |     --no_save_rng true \
31 |     --sequence_parallel true \
32 |     --attention_backend flash
33 | 


--------------------------------------------------------------------------------
/examples/train/megatron/sft.sh:
--------------------------------------------------------------------------------
 1 | # 2 * 80GiB
 2 | NPROC_PER_NODE=2 \
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | megatron sft \
 5 |     --load Qwen2.5-7B-Instruct-mcore \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 8 |               'swift/self-cognition#500' \
 9 |     --tensor_model_parallel_size 2 \
10 |     --micro_batch_size 4 \
11 |     --global_batch_size 16 \
12 |     --recompute_granularity selective \
13 |     --train_iters 100 \
14 |     --eval_iters 5 \
15 |     --finetune true \
16 |     --cross_entropy_loss_fusion true \
17 |     --lr 1e-5 \
18 |     --lr_warmup_iters 10 \
19 |     --min_lr 1e-6 \
20 |     --save megatron_output/Qwen2.5-7B-Instruct \
21 |     --save_interval 100 \
22 |     --max_length 2048 \
23 |     --system 'You are a helpful assistant.' \
24 |     --num_workers 4 \
25 |     --no_save_optim true \
26 |     --no_save_rng true \
27 |     --dataset_num_proc 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot
30 | 


--------------------------------------------------------------------------------
/examples/train/moe/llama4.sh:
--------------------------------------------------------------------------------
 1 | # Manually select the target modules with `--target_regex` to avoid 'all-linear' selecting 'router'
 2 | NPROC_PER_NODE=4 \
 3 | USE_HF=1 \
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | swift sft \
 6 |     --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
 7 |     --dataset 'linxy/LaTeX_OCR:full#5000' \
 8 |     --train_type lora \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_regex '^(language_model).*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$' \
17 |     --freeze_vit true \
18 |     --gradient_accumulation_steps 4 \
19 |     --gradient_checkpointing true \
20 |     --eval_steps 50 \
21 |     --save_steps 50 \
22 |     --save_total_limit 2 \
23 |     --logging_steps 5 \
24 |     --max_length 2048 \
25 |     --output_dir output \
26 |     --warmup_ratio 0.05 \
27 |     --deepspeed zero3 \
28 |     --dataloader_num_workers 4
29 | 


--------------------------------------------------------------------------------
/examples/train/moe/qwen2_5_moe.sh:
--------------------------------------------------------------------------------
 1 | # Manually select `target_modules` to avoid 'all-linear' selecting 'gate'
 2 | CUDA_VISIBLE_DEVICES=0,1 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2-57B-A14B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 8 |               'swift/self-cognition#500' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --system 'You are a helpful assistant.' \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --model_author swift \
28 |     --model_name swift-robot
29 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/ddp/train.sh:
--------------------------------------------------------------------------------
 1 | # 27.5GiB * 2
 2 | nproc_per_node=2
 3 | # gradient_accumulation_steps is derived as 16 / nproc_per_node (integer division; 8 for 2 processes).
 4 | CUDA_VISIBLE_DEVICES=0,1 \
 5 | NPROC_PER_NODE=$nproc_per_node \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --torch_dtype bfloat16 \
10 |     --dataset 'swift/self-cognition#1000' \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $((16 / nproc_per_node)) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot \
30 |     --gradient_checkpointing_kwargs '{"use_reentrant": false}'
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/ddp_device_map/train.sh:
--------------------------------------------------------------------------------
 1 | # 14GiB * 4
 2 | nproc_per_node=2
 3 | # gradient_accumulation_steps is derived as 16 / nproc_per_node (integer division; 8 for 2 processes).
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | NPROC_PER_NODE=$nproc_per_node \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset 'swift/self-cognition#1000' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $((16 / nproc_per_node)) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot \
30 |     --gradient_checkpointing_kwargs '{"use_reentrant": false}'
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/deepspeed/train_zero2.sh:
--------------------------------------------------------------------------------
 1 | # 18GiB * 2
 2 | nproc_per_node=2
 3 | # gradient_accumulation_steps is derived as 16 / nproc_per_node (integer division; 8 for 2 processes).
 4 | CUDA_VISIBLE_DEVICES=0,1 \
 5 | NPROC_PER_NODE=$nproc_per_node \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset 'swift/self-cognition#1000' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $((16 / nproc_per_node)) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot \
30 |     --deepspeed zero2
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/deepspeed/train_zero3.sh:
--------------------------------------------------------------------------------
 1 | # 16GiB * 2
 2 | nproc_per_node=2
 3 | # gradient_accumulation_steps is derived as 16 / nproc_per_node (integer division; 8 for 2 processes).
 4 | CUDA_VISIBLE_DEVICES=0,1 \
 5 | NPROC_PER_NODE=$nproc_per_node \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset 'swift/self-cognition#1000' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $((16 / nproc_per_node)) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --system 'You are a helpful assistant.' \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --model_author swift \
29 |     --model_name swift-robot \
30 |     --deepspeed zero3
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/device_map/train.sh:
--------------------------------------------------------------------------------
 1 | # 2 * 76GiB
 2 | CUDA_VISIBLE_DEVICES=0,1 \
 3 | MAX_PIXELS=1003520 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-VL-72B-Instruct \
 6 |     --dataset 'modelscope/coco_2014_caption:validation#20000' \
 7 |     --train_type lora \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --freeze_vit true \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4
26 | 


--------------------------------------------------------------------------------
/examples/train/multi-gpu/fsdp_qlora/fsdp_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compute_environment": "LOCAL_MACHINE",
 3 |   "debug": false,
 4 |   "distributed_type": "FSDP",
 5 |   "downcast_bf16": "no",
 6 |   "fsdp_config": {
 7 |     "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
 8 |     "fsdp_backward_prefetch": "BACKWARD_PRE",
 9 |     "fsdp_cpu_ram_efficient_loading": true,
10 |     "fsdp_forward_prefetch": false,
11 |     "fsdp_offload_params": true,
12 |     "fsdp_sharding_strategy": "FULL_SHARD",
13 |     "fsdp_state_dict_type": "FULL_STATE_DICT",
14 |     "fsdp_sync_module_states": true,
15 |     "fsdp_use_orig_params": false
16 |   },
17 |   "machine_rank": 0,
18 |   "main_training_function": "main",
19 |   "mixed_precision": "no",
20 |   "num_machines": 1,
21 |   "num_processes": 2,
22 |   "rdzv_backend": "static",
23 |   "same_network": true,
24 |   "tpu_env": [],
25 |   "tpu_use_cluster": false,
26 |   "tpu_use_sudo": false,
27 |   "use_cpu": false
28 | }
29 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/accelerate/multi_node.yaml:
--------------------------------------------------------------------------------
 1 | compute_environment: LOCAL_MACHINE
 2 | deepspeed_config:
 3 |     deepspeed_multinode_launcher: standard
 4 |     gradient_accumulation_steps: 16
 5 |     offload_optimizer_device: none
 6 |     offload_param_device: none
 7 |     zero3_init_flag: false
 8 |     zero_stage: 3
 9 | distributed_type: DEEPSPEED
10 | main_process_ip: 'xxx.xxx.xxx.xxx'
11 | main_process_port: 29500
12 | main_training_function: main
13 | mixed_precision: bf16
14 | num_machines: 2
15 | num_processes: 8  # world size
16 | rdzv_backend: static
17 | use_cpu: false
18 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/accelerate/train_node1.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 2 | accelerate launch --config_file ./examples/train/multi-node/accelerate/multi_node.yaml --machine_rank 0 \
 3 |     swift/cli/sft.py \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --torch_dtype bfloat16 \
 7 |     --dataset 'swift/self-cognition#1000' \
 8 |     --num_train_epochs 1 \
 9 |     --lora_rank 8 \
10 |     --lora_alpha 32 \
11 |     --learning_rate 1e-4 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 100 \
14 |     --save_steps 100 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --model_author swift \
18 |     --model_name swift-robot
19 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/accelerate/train_node2.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 2 | accelerate launch --config_file ./examples/train/multi-node/accelerate/multi_node.yaml --machine_rank 1 \
 3 |     swift/cli/sft.py \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --torch_dtype bfloat16 \
 7 |     --dataset 'swift/self-cognition#1000' \
 8 |     --num_train_epochs 1 \
 9 |     --lora_rank 8 \
10 |     --lora_alpha 32 \
11 |     --learning_rate 1e-4 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 100 \
14 |     --save_steps 100 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --model_author swift \
18 |     --model_name swift-robot
19 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/deepspeed/host.txt:
--------------------------------------------------------------------------------
1 | worker-0 slots=2
2 | worker-1 slots=2
3 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/deepspeed/train.sh:
--------------------------------------------------------------------------------
 1 | # Multi-node launch via the DeepSpeed hostfile. To use only some of the GPUs on each node, add e.g.:
 2 | # --include="worker-0:0,1@worker-1:2,3"
 3 | deepspeed --hostfile=./examples/train/multi-node/deepspeed/host.txt \
 4 |     swift/cli/sft.py \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type lora \
 7 |     --torch_dtype bfloat16 \
 8 |     --dataset 'swift/self-cognition#1000' \
 9 |     --num_train_epochs 1 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --learning_rate 1e-4 \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/dlc/train.sh:
--------------------------------------------------------------------------------
 1 | # https://help.aliyun.com/zh/pai/user-guide/general-environment-variables
 2 | NNODES=$WORLD_SIZE \
 3 | NODE_RANK=$RANK \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type full \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#20000' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 4 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --output_dir output \
21 |     --system 'You are a helpful assistant.' \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4 \
24 |     --deepspeed zero2
25 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/swift/train_node1.sh:
--------------------------------------------------------------------------------
 1 | nnodes=2
 2 | nproc_per_node=4
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | NNODES=$nnodes \
 6 | NODE_RANK=0 \
 7 | MASTER_ADDR=127.0.0.1 \
 8 | MASTER_PORT=29500 \
 9 | NPROC_PER_NODE=$nproc_per_node \
10 | swift sft \
11 |     --model Qwen/Qwen2.5-7B-Instruct \
12 |     --train_type full \
13 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \
14 |               'AI-ModelScope/alpaca-gpt4-data-en#20000' \
15 |     --torch_dtype bfloat16 \
16 |     --num_train_epochs 1 \
17 |     --per_device_train_batch_size 1 \
18 |     --per_device_eval_batch_size 1 \
19 |     --learning_rate 1e-5 \
20 |     --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \
21 |     --eval_steps 100 \
22 |     --save_steps 100 \
23 |     --save_total_limit 2 \
24 |     --logging_steps 5 \
25 |     --max_length 8192 \
26 |     --output_dir output \
27 |     --system 'You are a helpful assistant.' \
28 |     --warmup_ratio 0.05 \
29 |     --dataloader_num_workers 4 \
30 |     --deepspeed zero2
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/swift/train_node2.sh:
--------------------------------------------------------------------------------
 1 | nnodes=2
 2 | nproc_per_node=4
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | NNODES=$nnodes \
 6 | NODE_RANK=1 \
 7 | MASTER_ADDR=xxx.xxx.xxx.xxx \
 8 | MASTER_PORT=29500 \
 9 | NPROC_PER_NODE=$nproc_per_node \
10 | swift sft \
11 |     --model Qwen/Qwen2.5-7B-Instruct \
12 |     --train_type full \
13 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \
14 |               'AI-ModelScope/alpaca-gpt4-data-en#20000' \
15 |     --torch_dtype bfloat16 \
16 |     --num_train_epochs 1 \
17 |     --per_device_train_batch_size 1 \
18 |     --per_device_eval_batch_size 1 \
19 |     --learning_rate 1e-5 \
20 |     --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \
21 |     --eval_steps 100 \
22 |     --save_steps 100 \
23 |     --save_total_limit 2 \
24 |     --logging_steps 5 \
25 |     --max_length 8192 \
26 |     --output_dir output \
27 |     --system 'You are a helpful assistant.' \
28 |     --warmup_ratio 0.05 \
29 |     --dataloader_num_workers 4 \
30 |     --deepspeed zero2
31 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/torchrun/train_node1.sh:
--------------------------------------------------------------------------------
 1 | nnodes=2
 2 | nproc_per_node=4
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | torchrun \
 6 |     --master_port 29500 \
 7 |     --nproc_per_node=$nproc_per_node \
 8 |     --nnodes=$nnodes \
 9 |     --node_rank=0 \
10 |     --master_addr=127.0.0.1 \
11 |     swift/cli/sft.py \
12 |     --model Qwen/Qwen2.5-7B-Instruct \
13 |     --train_type full \
14 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \
15 |               'AI-ModelScope/alpaca-gpt4-data-en#20000' \
16 |     --torch_dtype bfloat16 \
17 |     --num_train_epochs 1 \
18 |     --per_device_train_batch_size 1 \
19 |     --per_device_eval_batch_size 1 \
20 |     --learning_rate 1e-5 \
21 |     --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \
22 |     --eval_steps 100 \
23 |     --save_steps 100 \
24 |     --save_total_limit 2 \
25 |     --logging_steps 5 \
26 |     --max_length 8192 \
27 |     --output_dir output \
28 |     --system 'You are a helpful assistant.' \
29 |     --warmup_ratio 0.05 \
30 |     --dataloader_num_workers 4 \
31 |     --deepspeed zero2
32 | 


--------------------------------------------------------------------------------
/examples/train/multi-node/torchrun/train_node2.sh:
--------------------------------------------------------------------------------
 1 | nnodes=2
 2 | nproc_per_node=4
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | torchrun \
 6 |     --master_port 29500 \
 7 |     --nproc_per_node=$nproc_per_node \
 8 |     --nnodes=$nnodes \
 9 |     --node_rank=1 \
10 |     --master_addr=xxx.xxx.xxx.xxx \
11 |     swift/cli/sft.py \
12 |     --model Qwen/Qwen2.5-7B-Instruct \
13 |     --train_type full \
14 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' \
15 |               'AI-ModelScope/alpaca-gpt4-data-en#20000' \
16 |     --torch_dtype bfloat16 \
17 |     --num_train_epochs 1 \
18 |     --per_device_train_batch_size 1 \
19 |     --per_device_eval_batch_size 1 \
20 |     --learning_rate 1e-5 \
21 |     --gradient_accumulation_steps $(expr 32 / $nproc_per_node / $nnodes) \
22 |     --eval_steps 100 \
23 |     --save_steps 100 \
24 |     --save_total_limit 2 \
25 |     --logging_steps 5 \
26 |     --max_length 8192 \
27 |     --output_dir output \
28 |     --system 'You are a helpful assistant.' \
29 |     --warmup_ratio 0.05 \
30 |     --dataloader_num_workers 4 \
31 |     --deepspeed zero2
32 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/audio.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model Qwen/Qwen2-Audio-7B-Instruct \
 4 |     --dataset 'speech_asr/speech_asr_aishell1_trainsets:validation#20000' \
 5 |     --train_type lora \
 6 |     --torch_dtype bfloat16 \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --per_device_eval_batch_size 1 \
10 |     --learning_rate 1e-4 \
11 |     --lora_rank 8 \
12 |     --lora_alpha 32 \
13 |     --target_modules all-linear \
14 |     --freeze_vit true \
15 |     --gradient_accumulation_steps 16 \
16 |     --eval_steps 100 \
17 |     --save_steps 100 \
18 |     --save_total_limit 2 \
19 |     --logging_steps 5 \
20 |     --max_length 2048 \
21 |     --output_dir output \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4
24 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/caption.sh:
--------------------------------------------------------------------------------
 1 | # 22GiB
 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
 3 | # 1003520 = 1280 * 28 * 28
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | MAX_PIXELS=1003520 \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 8 |     --dataset 'modelscope/coco_2014_caption:validation#20000' \
 9 |     --train_type lora \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --freeze_vit true \
19 |     --gradient_accumulation_steps 16 \
20 |     --eval_steps 100 \
21 |     --save_steps 100 \
22 |     --save_total_limit 2 \
23 |     --logging_steps 5 \
24 |     --max_length 2048 \
25 |     --output_dir output \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4
28 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/grounding.sh:
--------------------------------------------------------------------------------
 1 | # 20GiB
 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | MAX_PIXELS=1003520 \
 5 | swift sft \
 6 |     --model Qwen/Qwen2-VL-7B-Instruct \
 7 |     --dataset 'AI-ModelScope/coco#20000' \
 8 |     --train_type lora \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --freeze_vit true \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --dataset_num_proc 4
28 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/infer.sh:
--------------------------------------------------------------------------------
1 | # Perform inference using the validation set from the training phase.
2 | CUDA_VISIBLE_DEVICES=0 \
3 | MAX_PIXELS=1003520 \
4 | swift infer \
5 |     --adapters output/vx-xxx/checkpoint-xxx \
6 |     --stream true \
7 |     --load_data_args true \
8 |     --max_new_tokens 2048
9 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/lora_llm_full_vit/infer.sh:
--------------------------------------------------------------------------------
1 | # If the weights have been merged, please use `--model`.
2 | CUDA_VISIBLE_DEVICES=0 \
3 | swift infer \
4 |     --adapters output/vx-xxx/checkpoint-xxx \
5 |     --stream true \
6 |     --load_data_args true \
7 |     --temperature 0 \
8 |     --max_new_tokens 2048
9 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/lora_llm_full_vit/merge_lora.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift export \
 3 |     --adapters output/vx-xxx/checkpoint-xxx \
 4 |     --merge_lora true
 5 | 
 6 | # CUDA_VISIBLE_DEVICES=0 \
 7 | # swift infer \
 8 | #     --model output/vx-xxx/checkpoint-xxx-merged \
 9 | #     --stream true \
10 | #     --load_data_args true \
11 | #     --temperature 0 \
12 | #     --max_new_tokens 2048
13 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/lora_llm_full_vit/sft.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 22GiB
 2 | # vit/merger lr 1e-5; llm lora lr 1e-4
 3 | NPROC_PER_NODE=4 \
 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 5 | MAX_PIXELS=1003520 \
 6 | swift sft \
 7 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 8 |     --dataset 'AI-ModelScope/coco#20000' \
 9 |     --train_type custom \
10 |     --external_plugins 'examples/train/multimodal/lora_llm_full_vit/custom_plugin.py' \
11 |     --torch_dtype bfloat16 \
12 |     --num_train_epochs 1 \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --learning_rate 1e-4 \
16 |     --vit_lr 1e-5 \
17 |     --aligner_lr 1e-5 \
18 |     --lora_rank 16 \
19 |     --lora_alpha 32 \
20 |     --gradient_accumulation_steps 4 \
21 |     --eval_steps 100 \
22 |     --save_steps 100 \
23 |     --save_total_limit 2 \
24 |     --logging_steps 5 \
25 |     --max_length 8192 \
26 |     --output_dir output \
27 |     --warmup_ratio 0.05 \
28 |     --dataloader_num_workers 4 \
29 |     --dataset_num_proc 4 \
30 |     --deepspeed zero2 \
31 |     --save_only_model true
32 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/ocr.sh:
--------------------------------------------------------------------------------
 1 | # 20GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | MAX_PIXELS=1003520 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 6 |     --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
 7 |     --train_type lora \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --freeze_vit true \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4
26 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/omni/infer.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | VIDEO_MAX_PIXELS=50176 \
 3 | FPS_MAX_FRAMES=12 \
 4 | MAX_PIXELS=1003520 \
 5 | ENABLE_AUDIO_OUTPUT=0 \
 6 | swift infer \
 7 |     --adapters output/vx-xxx/checkpoint-xxx \
 8 |     --stream true \
 9 |     --load_data_args true \
10 |     --max_new_tokens 2048
11 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/rlhf/dpo/full.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 50GiB
 2 | nproc_per_node=4
 3 | 
 4 | PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | MAX_PIXELS=1003520 \
 8 | swift rlhf \
 9 |     --rlhf_type dpo \
10 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
11 |     --dataset 'swift/RLAIF-V-Dataset#20000' \
12 |     --train_type full \
13 |     --torch_dtype bfloat16 \
14 |     --num_train_epochs 1 \
15 |     --per_device_train_batch_size 1 \
16 |     --per_device_eval_batch_size 1 \
17 |     --learning_rate 1e-5 \
18 |     --freeze_vit true \
19 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
20 |     --eval_steps 100 \
21 |     --save_steps 100 \
22 |     --save_total_limit 2 \
23 |     --deepspeed zero3 \
24 |     --logging_steps 5 \
25 |     --max_length 4096 \
26 |     --output_dir output \
27 |     --warmup_ratio 0.05 \
28 |     --dataloader_num_workers 4 \
29 |     --dataset_num_proc 4 \
30 |     --save_only_model true
31 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/rlhf/dpo/lora.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 50GiB
 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
 3 | # --rlhf_type cpo/orpo/simpo/rm are also supported
 4 | nproc_per_node=2
 5 | 
 6 | CUDA_VISIBLE_DEVICES=0,1 \
 7 | NPROC_PER_NODE=$nproc_per_node \
 8 | MAX_PIXELS=1003520 \
 9 | swift rlhf \
10 |     --rlhf_type dpo \
11 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
12 |     --dataset 'swift/RLAIF-V-Dataset#20000' \
13 |     --train_type lora \
14 |     --torch_dtype bfloat16 \
15 |     --num_train_epochs 1 \
16 |     --per_device_train_batch_size 1 \
17 |     --per_device_eval_batch_size 1 \
18 |     --learning_rate 1e-4 \
19 |     --lora_rank 8 \
20 |     --lora_alpha 32 \
21 |     --target_modules all-linear \
22 |     --freeze_vit true \
23 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
24 |     --eval_steps 100 \
25 |     --save_steps 100 \
26 |     --save_total_limit 2 \
27 |     --deepspeed zero2 \
28 |     --logging_steps 5 \
29 |     --max_length 4096 \
30 |     --output_dir output \
31 |     --warmup_ratio 0.05 \
32 |     --dataloader_num_workers 4 \
33 |     --dataset_num_proc 4
34 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/rlhf/kto.sh:
--------------------------------------------------------------------------------
 1 | # Due to the absence of a multi-modal open-source dataset for kto,
 2 | # we will use a pure text kto dataset as an example here.
 3 | nproc_per_node=2
 4 | 
 5 | CUDA_VISIBLE_DEVICES=0,1 \
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | MAX_PIXELS=1003520 \
 8 | swift rlhf \
 9 |     --rlhf_type kto \
10 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
11 |     --dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \
12 |     --train_type lora \
13 |     --torch_dtype bfloat16 \
14 |     --num_train_epochs 1 \
15 |     --per_device_train_batch_size 1 \
16 |     --per_device_eval_batch_size 1 \
17 |     --learning_rate 1e-4 \
18 |     --lora_rank 8 \
19 |     --lora_alpha 32 \
20 |     --target_modules all-linear \
21 |     --freeze_vit true \
22 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
23 |     --eval_steps 100 \
24 |     --save_steps 100 \
25 |     --save_total_limit 2 \
26 |     --deepspeed zero2 \
27 |     --logging_steps 5 \
28 |     --max_length 4096 \
29 |     --output_dir output \
30 |     --warmup_ratio 0.05 \
31 |     --dataloader_num_workers 4 \
32 |     --dataset_num_proc 4
33 | 


--------------------------------------------------------------------------------
/examples/train/multimodal/video.sh:
--------------------------------------------------------------------------------
 1 | # 4*80GB
 2 | # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `VIDEO_MAX_PIXELS` parameter.
 3 | nproc_per_node=4
 4 | 
 5 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | VIDEO_MAX_PIXELS=50176 \
 8 | FPS_MAX_FRAMES=12 \
 9 | swift sft \
10 |     --model Qwen/QVQ-72B-Preview \
11 |     --dataset swift/VideoChatGPT:all \
12 |     --train_type lora \
13 |     --torch_dtype bfloat16 \
14 |     --num_train_epochs 1 \
15 |     --per_device_train_batch_size 1 \
16 |     --per_device_eval_batch_size 1 \
17 |     --learning_rate 1e-4 \
18 |     --lora_rank 8 \
19 |     --lora_alpha 32 \
20 |     --target_modules all-linear \
21 |     --freeze_vit true \
22 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
23 |     --eval_steps 50 \
24 |     --save_steps 50 \
25 |     --save_total_limit 2 \
26 |     --logging_steps 5 \
27 |     --max_length 2048 \
28 |     --output_dir output \
29 |     --warmup_ratio 0.05 \
30 |     --dataloader_num_workers 4 \
31 |     --deepspeed zero3
32 | 


--------------------------------------------------------------------------------
/examples/train/packing/llm.sh:
--------------------------------------------------------------------------------
 1 | # 22GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --packing true \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'swift/self-cognition#500' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 3 \
12 |     --attn_impl flash_attn \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --learning_rate 1e-4 \
16 |     --lora_rank 8 \
17 |     --lora_alpha 32 \
18 |     --target_modules all-linear \
19 |     --gradient_accumulation_steps 4 \
20 |     --eval_steps 50 \
21 |     --save_steps 50 \
22 |     --save_total_limit 2 \
23 |     --logging_steps 5 \
24 |     --max_length 2048 \
25 |     --output_dir output \
26 |     --system 'You are a helpful assistant.' \
27 |     --warmup_ratio 0.05 \
28 |     --dataloader_num_workers 4 \
29 |     --dataset_num_proc 4 \
30 |     --model_author swift \
31 |     --model_name swift-robot
32 | 


--------------------------------------------------------------------------------
/examples/train/packing/streaming.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 36GB
 2 | # A demo using the Hugging Face dataset
 3 | # The first model weights will be saved around step 70.
 4 | NPROC_PER_NODE=4 \
 5 | MAX_PIXELS=1003520 \
 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 7 | HF_ENDPOINT=https://hf-mirror.com \
 8 | swift sft \
 9 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
10 |     --train_type lora \
11 |     --dataset 'HF::linxy/LaTeX_OCR:full#20000' \
12 |     --torch_dtype bfloat16 \
13 |     --attn_impl flash_attn \
14 |     --streaming true \
15 |     --shuffle_buffer_size 1000 \
16 |     --packing true \
17 |     --save_strategy epoch \
18 |     --max_steps 1000 \
19 |     --max_epochs 5 \
20 |     --per_device_train_batch_size 1 \
21 |     --per_device_eval_batch_size 1 \
22 |     --learning_rate 1e-4 \
23 |     --lora_rank 8 \
24 |     --lora_alpha 32 \
25 |     --target_modules all-linear \
26 |     --gradient_accumulation_steps 1 \
27 |     --save_total_limit 2 \
28 |     --logging_steps 5 \
29 |     --max_length 8192 \
30 |     --output_dir output \
31 |     --warmup_ratio 0.05 \
32 |     --dataloader_num_workers 1 \
33 |     --dataset_num_proc 8 \
34 |     --deepspeed zero2
35 | 


--------------------------------------------------------------------------------
/examples/train/padding_free/sft.sh:
--------------------------------------------------------------------------------
 1 | # Supported multimodal models reference:
 2 | # https://github.com/modelscope/ms-swift/blob/main/examples/train/packing/qwen2_5_vl.sh
 3 | # without padding_free: 4 * 60GiB, 26h
 4 | # padding_free: 4 * 44GiB, 13h
 5 | NPROC_PER_NODE=4 \
 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 7 | swift sft \
 8 |     --model Qwen/Qwen2.5-7B \
 9 |     --train_type full \
10 |     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
11 |     --torch_dtype bfloat16 \
12 |     --per_device_train_batch_size 8 \
13 |     --per_device_eval_batch_size 8 \
14 |     --learning_rate 1e-5 \
15 |     --gradient_accumulation_steps 1 \
16 |     --eval_steps 200 \
17 |     --save_steps 200 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --warmup_ratio 0.05 \
21 |     --dataloader_num_workers 8 \
22 |     --dataset_num_proc 8 \
23 |     --save_total_limit 2 \
24 |     --save_only_model true \
25 |     --output_dir output/Qwen2.5-7B \
26 |     --deepspeed zero3 \
27 |     --use_liger_kernel true \
28 |     --attn_impl flash_attn \
29 |     --padding_free true
30 | 


--------------------------------------------------------------------------------
/examples/train/plugins/loss_scale.sh:
--------------------------------------------------------------------------------
 1 | # Pass `--loss_scale all` to train on all tokens,
 2 | # together with `--loss_type loss_scale`.
 3 | # This is just an example.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift sft \
 6 |     --model Qwen/Qwen2.5-7B-Instruct \
 7 |     --train_type lora \
 8 |     --dataset 'swift/self-cognition#1000' \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --learning_rate 1e-4 \
12 |     --lora_rank 8 \
13 |     --lora_alpha 32 \
14 |     --gradient_accumulation_steps 16 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --model_author swift \
20 |     --model_name swift-robot \
21 |     --loss_scale all \
22 |     --loss_type loss_scale
23 | 


--------------------------------------------------------------------------------
/examples/train/plugins/tuner_phi4_mm.sh:
--------------------------------------------------------------------------------
 1 | # `--train_type dummy`
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model LLM-Research/Phi-4-multimodal-instruct \
 5 |     --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
 6 |     --train_type dummy \
 7 |     --torch_dtype bfloat16 \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 1 \
10 |     --per_device_eval_batch_size 1 \
11 |     --learning_rate 1e-4 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 200 \
14 |     --save_steps 200 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --max_length 2048 \
18 |     --output_dir output \
19 |     --warmup_ratio 0.05 \
20 |     --dataloader_num_workers 4
21 | 


--------------------------------------------------------------------------------
/examples/train/predict_with_generate/train.sh:
--------------------------------------------------------------------------------
 1 | # 20GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | MAX_PIXELS=1003520 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-VL-7B-Instruct \
 6 |     --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
 7 |     --train_type lora \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 2 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --freeze_vit true \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4 \
26 |     --predict_with_generate true \
27 |     --metric_for_best_model rouge-l \
28 |     --greater_is_better true
29 | 


--------------------------------------------------------------------------------
/examples/train/pretrain/train.sh:
--------------------------------------------------------------------------------
 1 | # If not using flash_attn, or transformers<4.44,
 2 | # or encountering an abnormally large loss (i.e., the model does not support packing),
 3 | # please remove `--packing true`.
 4 | nproc_per_node=4
 5 | 
 6 | NPROC_PER_NODE=$nproc_per_node \
 7 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 8 | swift pt \
 9 |     --model Qwen/Qwen2.5-7B \
10 |     --train_type full \
11 |     --dataset swift/chinese-c4 \
12 |     --torch_dtype bfloat16 \
13 |     --streaming true \
14 |     --per_device_train_batch_size 1 \
15 |     --per_device_eval_batch_size 1 \
16 |     --learning_rate 1e-5 \
17 |     --gradient_accumulation_steps $(expr 64 / $nproc_per_node) \
18 |     --packing true \
19 |     --eval_steps 500 \
20 |     --save_steps 500 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --deepspeed zero3 \
24 |     --max_length 8192 \
25 |     --max_steps 10000 \
26 |     --warmup_ratio 0.05 \
27 |     --dataloader_num_workers 4 \
28 |     --dataset_num_proc 8 \
29 |     --save_only_model true \
30 |     --output_dir output/Qwen2.5-7B \
31 |     --attn_impl flash_attn
32 | 


--------------------------------------------------------------------------------
/examples/train/qlora/awq.sh:
--------------------------------------------------------------------------------
 1 | # 10GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct-AWQ \
 5 |     --train_type lora \
 6 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 7 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 8 |               'swift/self-cognition#500' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --system 'You are a helpful assistant.' \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --model_author swift \
28 |     --model_name swift-robot
29 | 


--------------------------------------------------------------------------------
/examples/train/qlora/bnb.sh:
--------------------------------------------------------------------------------
 1 | # 10GB
 2 | # pip install bitsandbytes
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'swift/self-cognition#500' \
10 |     --torch_dtype bfloat16 \
11 |     --bnb_4bit_compute_dtype bfloat16 \
12 |     --bnb_4bit_quant_type nf4 \
13 |     --bnb_4bit_use_double_quant true \
14 |     --quant_method bnb \
15 |     --quant_bits 4 \
16 |     --num_train_epochs 1 \
17 |     --per_device_train_batch_size 1 \
18 |     --per_device_eval_batch_size 1 \
19 |     --learning_rate 1e-4 \
20 |     --lora_rank 8 \
21 |     --lora_alpha 32 \
22 |     --target_modules all-linear \
23 |     --gradient_accumulation_steps 16 \
24 |     --eval_steps 50 \
25 |     --save_steps 50 \
26 |     --save_total_limit 2 \
27 |     --logging_steps 5 \
28 |     --max_length 2048 \
29 |     --output_dir output \
30 |     --system 'You are a helpful assistant.' \
31 |     --warmup_ratio 0.05 \
32 |     --dataloader_num_workers 4 \
33 |     --model_author swift \
34 |     --model_name swift-robot
35 | 


--------------------------------------------------------------------------------
/examples/train/qlora/gptq.sh:
--------------------------------------------------------------------------------
 1 | # 2 * 30GiB
 2 | CUDA_VISIBLE_DEVICES=0,1 \
 3 | MAX_PIXELS=1003520 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-VL-72B-Instruct-GPTQ-Int4 \
 6 |     --dataset 'modelscope/coco_2014_caption:validation#20000' \
 7 |     --train_type lora \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --freeze_vit true \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4
26 | 


--------------------------------------------------------------------------------
/examples/train/qlora/hqq.sh:
--------------------------------------------------------------------------------
 1 | # 10GB
 2 | # pip install hqq
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
 8 |               'AI-ModelScope/alpaca-gpt4-data-en#500' \
 9 |               'swift/self-cognition#500' \
10 |     --torch_dtype bfloat16 \
11 |     --quant_method hqq \
12 |     --quant_bits 4 \
13 |     --num_train_epochs 1 \
14 |     --per_device_train_batch_size 1 \
15 |     --per_device_eval_batch_size 1 \
16 |     --learning_rate 1e-4 \
17 |     --lora_rank 8 \
18 |     --lora_alpha 32 \
19 |     --target_modules all-linear \
20 |     --gradient_accumulation_steps 16 \
21 |     --eval_steps 50 \
22 |     --save_steps 50 \
23 |     --save_total_limit 2 \
24 |     --logging_steps 5 \
25 |     --max_length 2048 \
26 |     --output_dir output \
27 |     --system 'You are a helpful assistant.' \
28 |     --warmup_ratio 0.05 \
29 |     --dataloader_num_workers 4 \
30 |     --model_author swift \
31 |     --model_name swift-robot
32 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/README.md:
--------------------------------------------------------------------------------
1 | # TIPS
2 | 
3 | RLHF for multi-modal models is also supported! See the multimodal folder for details.
4 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/cpo.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=2
 2 | 
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | NPROC_PER_NODE=$nproc_per_node \
 5 | swift rlhf \
 6 |     --rlhf_type cpo \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --deepspeed zero2 \
28 |     --dataset_num_proc 4
29 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/dpo/full.sh:
--------------------------------------------------------------------------------
 1 | # 4 * 50GiB
 2 | NPROC_PER_NODE=4 \
 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
 4 | swift rlhf \
 5 |     --rlhf_type dpo \
 6 |     --model Qwen/Qwen2.5-7B-Instruct \
 7 |     --train_type full \
 8 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-5 \
14 |     --gradient_accumulation_steps 4 \
15 |     --eval_steps 100 \
16 |     --save_steps 100 \
17 |     --save_total_limit 2 \
18 |     --logging_steps 5 \
19 |     --max_length 8192 \
20 |     --output_dir output \
21 |     --warmup_ratio 0.05 \
22 |     --save_only_model true \
23 |     --dataloader_num_workers 4 \
24 |     --dataset_num_proc 4 \
25 |     --deepspeed zero3 \
26 |     --attn_impl flash_attn
27 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/dpo/lora.sh:
--------------------------------------------------------------------------------
 1 | # 24GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift rlhf \
 4 |     --rlhf_type dpo \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type lora \
 7 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
 8 |     --torch_dtype bfloat16 \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --per_device_eval_batch_size 1 \
12 |     --learning_rate 1e-4 \
13 |     --lora_rank 8 \
14 |     --lora_alpha 32 \
15 |     --target_modules all-linear \
16 |     --gradient_accumulation_steps 16 \
17 |     --eval_steps 100 \
18 |     --save_steps 100 \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --max_length 2048 \
22 |     --output_dir output \
23 |     --warmup_ratio 0.05 \
24 |     --dataloader_num_workers 4 \
25 |     --dataset_num_proc 4
26 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/kto.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=2
 2 | 
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | NPROC_PER_NODE=$nproc_per_node \
 5 | swift rlhf \
 6 |     --rlhf_type kto \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
18 |     --eval_steps 100 \
19 |     --save_steps 100 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4 \
26 |     --deepspeed zero2 \
27 |     --dataset_num_proc 4
28 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/orpo.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=2
 2 | 
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | NPROC_PER_NODE=$nproc_per_node \
 5 | swift rlhf \
 6 |     --rlhf_type orpo \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --deepspeed zero2 \
28 |     --dataset_num_proc 4
29 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/rm.sh:
--------------------------------------------------------------------------------
 1 | nproc_per_node=2
 2 | 
 3 | CUDA_VISIBLE_DEVICES=0,1 \
 4 | NPROC_PER_NODE=$nproc_per_node \
 5 | swift rlhf \
 6 |     --rlhf_type rm \
 7 |     --model Qwen/Qwen2.5-7B-Instruct \
 8 |     --train_type lora \
 9 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
19 |     --eval_steps 100 \
20 |     --save_steps 100 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --deepspeed zero2 \
28 |     --dataset_num_proc 4
29 | 


--------------------------------------------------------------------------------
/examples/train/rlhf/simpo.sh:
--------------------------------------------------------------------------------
 1 | # 2*50GB
 2 | nproc_per_node=2
 3 | 
 4 | CUDA_VISIBLE_DEVICES=0,1 \
 5 | NPROC_PER_NODE=$nproc_per_node \
 6 | swift rlhf \
 7 |     --rlhf_type simpo \
 8 |     --model Qwen/Qwen2.5-3B-Instruct \
 9 |     --train_type full \
10 |     --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
11 |     --torch_dtype bfloat16 \
12 |     --num_train_epochs 1 \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --learning_rate 1e-5 \
16 |     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
17 |     --eval_steps 100 \
18 |     --save_steps 100 \
19 |     --save_total_limit 2 \
20 |     --logging_steps 5 \
21 |     --max_length 2048 \
22 |     --output_dir output \
23 |     --warmup_ratio 0.05 \
24 |     --dataloader_num_workers 4 \
25 |     --deepspeed zero2 \
26 |     --dataset_num_proc 4
27 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/bert/deploy.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift deploy \
 3 |     --adapters output/vx-xxx/checkpoint-xxx \
 4 |     --served_model_name bert-base-chinese \
 5 |     --truncation_strategy right \
 6 |     --max_length 512
 7 | 
 8 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
 9 | # "model": "bert-base-chinese",
10 | # "messages": [{"role": "user", "content": "包装差，容易被调包。"}]
11 | # }'
12 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/bert/infer.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --adapters output/vx-xxx/checkpoint-xxx \
4 |     --load_data_args true \
5 |     --max_batch_size 16 \
6 |     --truncation_strategy right \
7 |     --max_length 512
8 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/bert/sft.sh:
--------------------------------------------------------------------------------
 1 | # If `num_labels` is provided, it will be considered a classification task,
 2 | # and AutoModelForSequenceClassification will be used to load the model.
 3 | # The BERT model does not require templates, so it can usually be used without registration.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift sft \
 6 |     --model AI-ModelScope/bert-base-chinese \
 7 |     --train_type lora \
 8 |     --dataset 'DAMO_NLP/jd:cls#2000' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 512 \
23 |     --truncation_strategy right \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --num_labels 2 \
28 |     --task_type seq_cls
29 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/multi_label/sft.sh:
--------------------------------------------------------------------------------
 1 | # Custom dataset format reference: https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-0.5B \
 5 |     --train_type lora \
 6 |     --dataset '<your-dataset>' \
 7 |     --torch_dtype bfloat16 \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 16 \
10 |     --per_device_eval_batch_size 16 \
11 |     --learning_rate 1e-4 \
12 |     --lora_rank 8 \
13 |     --lora_alpha 32 \
14 |     --target_modules all-linear \
15 |     --gradient_accumulation_steps 1 \
16 |     --eval_steps 100 \
17 |     --save_steps 100 \
18 |     --save_total_limit 2 \
19 |     --logging_steps 5 \
20 |     --max_length 2048 \
21 |     --output_dir output \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4 \
24 |     --dataset_num_proc 4 \
25 |     --num_labels '<num-labels>' \
26 |     --task_type seq_cls \
27 |     --use_chat_template false \
28 |     --problem_type multi_label_classification
29 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/qwen2_5/deploy.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift deploy \
3 |     --adapters output/vx-xxx/checkpoint-xxx
4 | 
5 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
6 | # "model": "Qwen2.5-0.5B",
7 | # "messages": [{"role": "user", "content": "包装差，容易被调包。"}]
8 | # }'
9 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/qwen2_5/infer.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --adapters output/vx-xxx/checkpoint-xxx \
4 |     --load_data_args true \
5 |     --max_batch_size 16
6 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/qwen2_5/sft.sh:
--------------------------------------------------------------------------------
 1 | # If `num_labels` is provided, it will be considered a classification task,
 2 | # and AutoModelForSequenceClassification will be used to load the model.
 3 | # You can also specify `--model Qwen/Qwen2.5-0.5B-Instruct --use_chat_template true`.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift sft \
 6 |     --model Qwen/Qwen2.5-0.5B \
 7 |     --train_type lora \
 8 |     --dataset 'DAMO_NLP/jd:cls#2000' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4 \
26 |     --num_labels 2 \
27 |     --task_type seq_cls \
28 |     --use_chat_template false
29 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/qwen2_vl/infer.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | MAX_PIXELS=1003520 \
3 | swift infer \
4 |     --adapters output/vx-xxx/checkpoint-xxx \
5 |     --load_data_args true
6 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/qwen2_vl/sft.sh:
--------------------------------------------------------------------------------
 1 | # If `num_labels` is provided, it will be considered a classification task.
 2 | # You can also specify `--model Qwen/Qwen2-VL-2B-Instruct --use_chat_template true`.
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | MAX_PIXELS=1003520 \
 5 | swift sft \
 6 |     --model Qwen/Qwen2-VL-2B \
 7 |     --train_type lora \
 8 |     --dataset 'tany0699/garbage265#20000' \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps 16 \
18 |     --eval_steps 50 \
19 |     --save_steps 50 \
20 |     --save_total_limit 2 \
21 |     --logging_steps 5 \
22 |     --max_length 2048 \
23 |     --output_dir output \
24 |     --warmup_ratio 0.05 \
25 |     --dataloader_num_workers 4 \
26 |     --num_labels 265 \
27 |     --task_type seq_cls \
28 |     --use_chat_template false
29 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/regression/deploy.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift deploy \
3 |     --adapters output/vx-xxx/checkpoint-xxx
4 | 
5 | # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
6 | # "model": "Qwen2.5-0.5B",
7 | # "messages": [{"role": "user", "content": "Task: Based on the given two sentences, provide a similarity score between 0.0 and 1.0.\nSentence 1: The animal is eating.\nSentence 2: A woman is dancing.\nSimilarity score: "}]
8 | # }'
9 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/regression/infer.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 \
2 | swift infer \
3 |     --adapters output/vx-xxx/checkpoint-xxx \
4 |     --load_data_args true \
5 |     --max_batch_size 16
6 | 


--------------------------------------------------------------------------------
/examples/train/seq_cls/regression/sft.sh:
--------------------------------------------------------------------------------
 1 | # 2GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-0.5B \
 5 |     --train_type lora \
 6 |     --dataset 'sentence-transformers/stsb:reg#20000' \
 7 |     --torch_dtype bfloat16 \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 16 \
10 |     --per_device_eval_batch_size 16 \
11 |     --learning_rate 1e-4 \
12 |     --lora_rank 8 \
13 |     --lora_alpha 32 \
14 |     --target_modules all-linear \
15 |     --gradient_accumulation_steps 1 \
16 |     --eval_steps 100 \
17 |     --save_steps 100 \
18 |     --save_total_limit 2 \
19 |     --logging_steps 5 \
20 |     --max_length 2048 \
21 |     --output_dir output \
22 |     --warmup_ratio 0.05 \
23 |     --dataloader_num_workers 4 \
24 |     --dataset_num_proc 4 \
25 |     --num_labels 1 \
26 |     --task_type seq_cls \
27 |     --use_chat_template false \
28 |     --problem_type regression
29 | 


--------------------------------------------------------------------------------
/examples/train/streaming/train.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model Qwen/Qwen2.5-7B-Instruct \
 4 |     --train_type lora \
 5 |     --dataset 'swift/self-cognition#1000' \
 6 |     --streaming true \
 7 |     --max_steps 1000 \
 8 |     --learning_rate 1e-4 \
 9 |     --lora_rank 8 \
10 |     --lora_alpha 32 \
11 |     --gradient_accumulation_steps 16 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --save_total_limit 2 \
15 |     --logging_steps 5 \
16 |     --model_author swift \
17 |     --model_name swift-robot
18 | 


--------------------------------------------------------------------------------
/examples/train/think_model/deepseek_r1.sh:
--------------------------------------------------------------------------------
 1 | # 18GB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model deepseek-ai/DeepSeek-R1-0528-Qwen3-8B \
 5 |     --train_type lora \
 6 |     --dataset 'swift/DeepSeek-R1-Qwen3-8B-Distill#1800' \
 7 |               'swift/self-cognition:empty_think#600' \
 8 |     --loss_scale ignore_empty_think \
 9 |     --torch_dtype bfloat16 \
10 |     --num_train_epochs 1 \
11 |     --per_device_train_batch_size 1 \
12 |     --per_device_eval_batch_size 1 \
13 |     --learning_rate 1e-4 \
14 |     --lora_rank 8 \
15 |     --lora_alpha 32 \
16 |     --target_modules all-linear \
17 |     --gradient_accumulation_steps 16 \
18 |     --load_from_cache_file false \
19 |     --eval_steps 50 \
20 |     --save_steps 50 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --use_liger_kernel true \
28 |     --model_author swift \
29 |     --model_name swift-robot
30 | 


--------------------------------------------------------------------------------
/examples/train/think_model/qwen3_demo1.sh:
--------------------------------------------------------------------------------
 1 | # Use `--loss_scale ignore_empty_think`.
 2 | # This avoids losing the thinking capability by excluding empty `<think>\n\n</think>\n\n` blocks from the loss.
 3 | # This method is also applicable to the DeepSeek-R1 series of models.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift sft \
 6 |     --model Qwen/Qwen3-8B \
 7 |     --train_type lora \
 8 |     --dataset 'swift/Qwen3-SFT-Mixin#2000' \
 9 |               'swift/self-cognition:empty_think#600' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 50 \
20 |     --save_steps 50 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --use_liger_kernel true \
28 |     --load_from_cache_file false \
29 |     --loss_scale ignore_empty_think \
30 |     --model_author swift \
31 |     --model_name swift-robot
32 | 


--------------------------------------------------------------------------------
/examples/train/think_model/qwen3_demo2.sh:
--------------------------------------------------------------------------------
 1 | # Use `swift/self-cognition:qwen3`, which avoids losing the thinking
 2 | # capability by appending `/no_think` to the dataset query.
 3 | # https://github.com/modelscope/ms-swift/blob/77985c2ccdac8ed4037174ee222e79d1f1d5059d/swift/llm/dataset/dataset/llm.py#L835
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | swift sft \
 6 |     --model Qwen/Qwen3-8B \
 7 |     --train_type lora \
 8 |     --dataset 'swift/Qwen3-SFT-Mixin#2000' \
 9 |               'swift/self-cognition:qwen3#600' \
10 |     --torch_dtype bfloat16 \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 1 \
13 |     --per_device_eval_batch_size 1 \
14 |     --learning_rate 1e-4 \
15 |     --lora_rank 8 \
16 |     --lora_alpha 32 \
17 |     --target_modules all-linear \
18 |     --gradient_accumulation_steps 16 \
19 |     --eval_steps 50 \
20 |     --save_steps 50 \
21 |     --save_total_limit 2 \
22 |     --logging_steps 5 \
23 |     --max_length 2048 \
24 |     --output_dir output \
25 |     --warmup_ratio 0.05 \
26 |     --dataloader_num_workers 4 \
27 |     --use_liger_kernel true \
28 |     --load_from_cache_file false \
29 |     --model_author swift \
30 |     --model_name swift-robot
31 | 


--------------------------------------------------------------------------------
/examples/train/tuners/adalora/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type adalora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --gradient_accumulation_steps 16 \
11 |     --eval_steps 100 \
12 |     --save_steps 100 \
13 |     --save_total_limit 2 \
14 |     --logging_steps 5 \
15 |     --model_author swift \
16 |     --model_name swift-robot
17 | 


--------------------------------------------------------------------------------
/examples/train/tuners/adapter/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type adapter \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --gradient_accumulation_steps 16 \
11 |     --eval_steps 100 \
12 |     --save_steps 100 \
13 |     --save_total_limit 2 \
14 |     --logging_steps 5 \
15 |     --model_author swift \
16 |     --model_name swift-robot
17 | 


--------------------------------------------------------------------------------
/examples/train/tuners/boft/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type boft \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --gradient_accumulation_steps 16 \
11 |     --eval_steps 100 \
12 |     --save_steps 100 \
13 |     --save_total_limit 2 \
14 |     --logging_steps 5 \
15 |     --model_author swift \
16 |     --model_name swift-robot
17 | 


--------------------------------------------------------------------------------
/examples/train/tuners/bone/train.sh:
--------------------------------------------------------------------------------
 1 | # 17.3GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type bone \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --gradient_accumulation_steps 16 \
11 |     --eval_steps 100 \
12 |     --save_steps 100 \
13 |     --save_total_limit 2 \
14 |     --logging_steps 5 \
15 |     --model_author swift \
16 |     --model_name swift-robot
17 | 


--------------------------------------------------------------------------------
/examples/train/tuners/dora/train.sh:
--------------------------------------------------------------------------------
 1 | # 17.2GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --use_dora true \
 7 |     --dataset 'swift/self-cognition#1000' \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 1 \
10 |     --learning_rate 1e-4 \
11 |     --lora_rank 8 \
12 |     --lora_alpha 32 \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/tuners/galore/train_galore.sh:
--------------------------------------------------------------------------------
 1 | # 38GiB
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type full \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-5 \
10 |     --gradient_accumulation_steps 16 \
11 |     --eval_steps 100 \
12 |     --save_steps 100 \
13 |     --save_total_limit 2 \
14 |     --logging_steps 5 \
15 |     --model_author swift \
16 |     --model_name swift-robot \
17 |     --use_galore true \
18 |     --galore_optim_per_parameter true
19 | 


--------------------------------------------------------------------------------
/examples/train/tuners/galore/train_qgalore.sh:
--------------------------------------------------------------------------------
 1 | # 35GiB -- full-parameter bfloat16 SFT with GaLore plus --galore_quantization, run on GPU 0 only
 2 | # pip install bitsandbytes==0.40.0
 3 | CUDA_VISIBLE_DEVICES=0 \
 4 | swift sft \
 5 |     --model Qwen/Qwen2.5-7B-Instruct \
 6 |     --train_type full \
 7 |     --torch_dtype bfloat16 \
 8 |     --dataset 'lvjianjin/AdvertiseGen#1000' \
 9 |     --num_train_epochs 1 \
10 |     --per_device_train_batch_size 1 \
11 |     --learning_rate 1e-5 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 100 \
14 |     --save_steps 100 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --model_author swift \
18 |     --model_name swift-robot \
19 |     --use_galore true \
20 |     --galore_quantization true
21 | 


--------------------------------------------------------------------------------
/examples/train/tuners/lisa/train.sh:
--------------------------------------------------------------------------------
 1 | # 29GiB -- full-parameter SFT with LISA (--lisa_activated_layers 2), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type full \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --lisa_activated_layers 2 \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 1 \
10 |     --learning_rate 1e-5 \
11 |     --gradient_accumulation_steps 16 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --save_total_limit 2 \
15 |     --logging_steps 5 \
16 |     --model_author swift \
17 |     --model_name swift-robot
18 | 


--------------------------------------------------------------------------------
/examples/train/tuners/llamapro/train.sh:
--------------------------------------------------------------------------------
 1 | # 25.4GiB -- SFT with --train_type llamapro (4 new blocks), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type llamapro \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --llamapro_num_new_blocks 4 \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 1 \
10 |     --learning_rate 1e-4 \
11 |     --gradient_accumulation_steps 16 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --save_total_limit 2 \
15 |     --logging_steps 5 \
16 |     --model_author swift \
17 |     --model_name swift-robot
18 | 


--------------------------------------------------------------------------------
/examples/train/tuners/longlora/train.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
 4 |     --train_type longlora \
 5 |     --dataset 'AI-ModelScope/LongAlpaca-12k#1000' \
 6 |     --num_train_epochs 1 \
 7 |     --learning_rate 1e-4 \
 8 |     --attn_impl flash_attn \
 9 |     --gradient_accumulation_steps 16 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --max_length 10000 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5
17 | 


--------------------------------------------------------------------------------
/examples/train/tuners/lora-ga/train.sh:
--------------------------------------------------------------------------------
 1 | # Train
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2-1.5B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --init_weights lora-ga \
13 |     --lora_ga_batch_size 2 \
14 |     --lora_ga_iters 2 \
15 |     --lora_ga_max_length 1024 \
16 |     --lora_ga_direction ArB2r \
17 |     --lora_ga_scale stable \
18 |     --lora_ga_stable_gamma 16 \
19 |     --gradient_accumulation_steps 16 \
20 |     --eval_steps 100 \
21 |     --save_steps 100 \
22 |     --save_total_limit 2 \
23 |     --logging_steps 5 \
24 |     --model_author swift \
25 |     --model_name swift-robot
26 | 
27 | # Infer
28 | # swift infer \
29 | #     --model Qwen/Qwen2-1.5B-Instruct \
30 | #     --ckpt_dir ./output/Qwen2-1.5B-Instruct/v0-20241214-191235/checkpoint-62/converted/default \
31 | #     --infer_backend pt \
32 | #     --stream true \
33 | #     --max_new_tokens 2048
34 | 


--------------------------------------------------------------------------------
/examples/train/tuners/lora/train.sh:
--------------------------------------------------------------------------------
 1 | # 17.2GiB -- LoRA SFT (rank 8, alpha 32), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --gradient_accumulation_steps 16 \
13 |     --eval_steps 100 \
14 |     --save_steps 100 \
15 |     --save_total_limit 2 \
16 |     --logging_steps 5 \
17 |     --model_author swift \
18 |     --model_name swift-robot
19 | 


--------------------------------------------------------------------------------
/examples/train/tuners/neftune/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB -- LoRA SFT with NEFTune noise (--neftune_noise_alpha 15), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --neftune_noise_alpha 15 \
10 |     --learning_rate 1e-4 \
11 |     --lora_rank 8 \
12 |     --lora_alpha 32 \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/tuners/olora/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB -- LoRA SFT with OLoRA weight initialization (--init_weights olora), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --init_weights olora \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/tuners/pissa/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB -- LoRA SFT with PiSSA weight initialization (--init_weights pissa), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --train_type lora \
 6 |     --dataset 'swift/self-cognition#1000' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --lora_rank 8 \
11 |     --lora_alpha 32 \
12 |     --init_weights pissa \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/examples/train/tuners/qlora/train.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model Qwen/Qwen2.5-7B-Instruct \
 4 |     --train_type lora \
 5 |     --dataset 'swift/self-cognition#1000' \
 6 |     --num_train_epochs 1 \
 7 |     --per_device_train_batch_size 1 \
 8 |     --learning_rate 1e-4 \
 9 |     --lora_rank 8 \
10 |     --lora_alpha 32 \
11 |     --gradient_accumulation_steps 16 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --save_total_limit 2 \
15 |     --logging_steps 5 \
16 |     --model_author swift \
17 |     --model_name swift-robot \
18 |     --quant_bits 4 \
19 |     --quant_method bnb
20 | 


--------------------------------------------------------------------------------
/examples/train/tuners/reft/train.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0 \
 2 | swift sft \
 3 |     --model Qwen/Qwen2.5-7B-Instruct \
 4 |     --train_type reft \
 5 |     --dataset 'swift/self-cognition#1000' \
 6 |     --reft_intervention_type 'LoreftIntervention' \
 7 |     --num_train_epochs 1 \
 8 |     --per_device_train_batch_size 1 \
 9 |     --learning_rate 1e-4 \
10 |     --gradient_checkpointing false \
11 |     --gradient_accumulation_steps 16 \
12 |     --eval_steps 100 \
13 |     --save_steps 100 \
14 |     --save_total_limit 2 \
15 |     --logging_steps 5 \
16 |     --model_author swift \
17 |     --model_name swift-robot
18 | 


--------------------------------------------------------------------------------
/examples/train/tuners/unsloth/train.sh:
--------------------------------------------------------------------------------
 1 | # 17GiB -- LoRA SFT using the unsloth tuner backend (--tuner_backend unsloth), run on GPU 0 only
 2 | CUDA_VISIBLE_DEVICES=0 \
 3 | swift sft \
 4 |     --model Qwen/Qwen2.5-7B-Instruct \
 5 |     --tuner_backend unsloth \
 6 |     --train_type lora \
 7 |     --dataset 'swift/self-cognition#1000' \
 8 |     --num_train_epochs 1 \
 9 |     --per_device_train_batch_size 1 \
10 |     --learning_rate 1e-4 \
11 |     --lora_rank 8 \
12 |     --lora_alpha 32 \
13 |     --gradient_accumulation_steps 16 \
14 |     --eval_steps 100 \
15 |     --save_steps 100 \
16 |     --save_total_limit 2 \
17 |     --logging_steps 5 \
18 |     --model_author swift \
19 |     --model_name swift-robot
20 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/framework.txt
2 | 


--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | docutils>=0.16.0
2 | myst_parser
3 | recommonmark
4 | sphinx>=5.3.0
5 | sphinx-book-theme
6 | sphinx-copybutton
7 | sphinx-rtd-theme
8 | sphinx_markdown_tables
9 | 


--------------------------------------------------------------------------------
/requirements/eval.txt:
--------------------------------------------------------------------------------
1 | evalscope[opencompass]
2 | evalscope[vlmeval]
3 | 


--------------------------------------------------------------------------------
/requirements/framework.txt:
--------------------------------------------------------------------------------
 1 | accelerate
 2 | addict
 3 | aiohttp
 4 | attrdict
 5 | binpacking
 6 | charset_normalizer
 7 | cpm_kernels
 8 | dacite
 9 | datasets>=3.0,<3.4
10 | einops
11 | fastapi
12 | gradio>=3.40.0
13 | importlib_metadata
14 | jieba
15 | matplotlib
16 | modelscope>=1.23
17 | nltk
18 | numpy<2.0
19 | openai
20 | oss2
21 | pandas
22 | peft>=0.11,<0.16
23 | pillow
24 | requests
25 | rouge
26 | safetensors
27 | scipy
28 | sentencepiece
29 | simplejson>=3.3.0
30 | sortedcontainers>=1.5.9
31 | tensorboard
32 | tiktoken
33 | tqdm
34 | transformers>=4.33,<4.53
35 | transformers_stream_generator
36 | trl>=0.15,<0.20
37 | uvicorn
38 | zstandard
39 | 


--------------------------------------------------------------------------------
/requirements/install_all.sh:
--------------------------------------------------------------------------------
 1 | # please use python=3.10, cuda12.*
 2 | # sh requirements/install_all.sh
 3 | pip install "vllm>=0.5.1,<0.9" -U
 4 | pip install "lmdeploy>=0.5" -U --no-deps
 5 | pip install autoawq -U --no-deps
 6 | pip install auto_gptq optimum bitsandbytes -U
 7 | pip install git+https://github.com/modelscope/ms-swift.git
 8 | pip install timm -U
 9 | pip install deepspeed -U
10 | pip install qwen_vl_utils qwen_omni_utils decord librosa icecream soundfile -U
11 | pip install liger_kernel nvitop pre-commit -U
12 | # flash-attn: https://github.com/Dao-AILab/flash-attention/releases
13 | 


--------------------------------------------------------------------------------
/requirements/seq_parallel.txt:
--------------------------------------------------------------------------------
1 | xtuner
2 | 


--------------------------------------------------------------------------------
/requirements/swanlab.txt:
--------------------------------------------------------------------------------
1 | swanlab
2 | 


--------------------------------------------------------------------------------
/requirements/tests.txt:
--------------------------------------------------------------------------------
1 | expecttest
2 | flake8
3 | isort>=4.3.21
4 | modelscope
5 | pre-commit
6 | yapf==0.30.0 # pin a fixed version to ensure consistent auto-styling
7 | 


--------------------------------------------------------------------------------
/scripts/utils/plot_loss.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from swift.utils import plot_images
 4 | 
 5 | ckpt_dir = 'output/xxx/vx-xxx'
 6 | if __name__ == '__main__':
 7 |     images_dir = os.path.join(ckpt_dir, 'images')
 8 |     tb_dir = os.path.join(ckpt_dir, 'runs')
 9 |     plot_images(images_dir, tb_dir, ['train/loss'], 0.9)
10 | 


--------------------------------------------------------------------------------
/scripts/utils/run_template.py:
--------------------------------------------------------------------------------
1 | from swift.llm import TemplateType
2 | 
3 | if __name__ == '__main__':
4 |     template_name_list = TemplateType.get_template_name_list()
5 |     tn_gen = ', '.join([tn for tn in template_name_list if 'generation' in tn])
6 |     tn_chat = ', '.join([tn for tn in template_name_list if 'generation' not in tn])
7 |     print(f'Text Generation: {tn_gen}')
8 |     print(f'Chat: {tn_chat}')
9 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [isort]
 2 | line_length = 120
 3 | multi_line_output = 0
 4 | known_standard_library = setuptools
 5 | known_first_party = swift
 6 | known_third_party = json,yaml
 7 | no_lines_before = STDLIB,LOCALFOLDER
 8 | default_section = THIRDPARTY
 9 | 
10 | [yapf]
11 | BASED_ON_STYLE = pep8
12 | COLUMN_LIMIT = 120
13 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
14 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
15 | SPLIT_BEFORE_ARITHMETIC_OPERATOR = true
16 | 
17 | [codespell]
18 | skip = *.ipynb
19 | quiet-level = 3
20 | ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
21 | 
22 | [flake8]
23 | max-line-length = 120
24 | select = B,C,E,F,P,T4,W,B9
25 | ignore = F401,F403,F405,F821,W503,E251,W504,E126
26 | exclude = docs/src,*.pyi,.git,peft.py
27 | 
28 | [darglint]
29 | ignore=DAR101
30 | 
31 | [easy_install]
32 | index-url=https://pypi.tuna.tsinghua.edu.cn/simple
33 | 


--------------------------------------------------------------------------------
/swift/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/cli/__init__.py


--------------------------------------------------------------------------------
/swift/cli/_megatron/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/cli/_megatron/__init__.py


--------------------------------------------------------------------------------
/swift/cli/_megatron/main.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from typing import Dict
 3 | 
 4 | from swift.utils import get_logger
 5 | from ..main import cli_main as swift_cli_main
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | ROUTE_MAPPING: Dict[str, str] = {  # sub-command name -> dotted path of the module implementing it
10 |     'pt': 'swift.cli._megatron.pt',
11 |     'sft': 'swift.cli._megatron.sft',
12 | }
13 | 
14 | 
15 | def cli_main():  # Megatron CLI entry point: delegates to the shared swift cli_main with ROUTE_MAPPING
16 |     return swift_cli_main(ROUTE_MAPPING)
17 | 
18 | 
19 | if __name__ == '__main__':
20 |     cli_main()
21 | 


--------------------------------------------------------------------------------
/swift/cli/_megatron/pt.py:
--------------------------------------------------------------------------------
1 | from swift.megatron import megatron_pt_main
2 | 
3 | if __name__ == '__main__':
4 |     megatron_pt_main()
5 | 


--------------------------------------------------------------------------------
/swift/cli/_megatron/sft.py:
--------------------------------------------------------------------------------
1 | from swift.megatron import megatron_sft_main
2 | 
3 | if __name__ == '__main__':
4 |     megatron_sft_main()
5 | 


--------------------------------------------------------------------------------
/swift/cli/app.py:
--------------------------------------------------------------------------------
1 | from swift.llm import app_main
2 | 
3 | if __name__ == '__main__':
4 |     app_main()
5 | 


--------------------------------------------------------------------------------
/swift/cli/deploy.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import deploy_main
3 | 
4 | if __name__ == '__main__':
5 |     deploy_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/eval.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import eval_main
3 | 
4 | if __name__ == '__main__':
5 |     eval_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/export.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import export_main
3 | 
4 | if __name__ == '__main__':
5 |     export_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/infer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import infer_main
3 | 
4 | if __name__ == '__main__':
5 |     infer_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/merge_lora.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from swift.llm import ExportArguments, SwiftPipeline, merge_lora
 3 | 
 4 | 
 5 | class SwiftMergeLoRA(SwiftPipeline):
 6 |     args_class = ExportArguments
 7 |     args: args_class
 8 | 
 9 |     def run(self):
10 |         merge_lora(self.args)
11 | 
12 | 
13 | if __name__ == '__main__':
14 |     SwiftMergeLoRA().main()
15 | 


--------------------------------------------------------------------------------
/swift/cli/pt.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import pt_main
3 | 
4 | if __name__ == '__main__':
5 |     pt_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/rlhf.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import rlhf_main
3 | 
4 | if __name__ == '__main__':
5 |     rlhf_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/rollout.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm import rollout_main
3 | 
4 | if __name__ == '__main__':
5 |     rollout_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/sample.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.llm.sampling import sampling_main
3 | 
4 | if __name__ == '__main__':
5 |     sampling_main()
6 | 


--------------------------------------------------------------------------------
/swift/cli/sft.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | import os
3 | 
4 | from swift.llm import sft_main
5 | 
6 | if __name__ == '__main__':
7 |     sft_main()
8 | 


--------------------------------------------------------------------------------
/swift/cli/web_ui.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.ui import webui_main
3 | 
4 | if __name__ == '__main__':
5 |     webui_main()
6 | 


--------------------------------------------------------------------------------
/swift/hub/__init__.py:
--------------------------------------------------------------------------------
1 | from .hub import HFHub, MSHub, get_hub
2 | 


--------------------------------------------------------------------------------
/swift/hub/constant.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from modelscope.hub import constants
3 | 
4 | constants.API_HTTP_CLIENT_TIMEOUT = 5
5 | constants.API_FILE_DOWNLOAD_TIMEOUT = 300
6 | constants.API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 16
7 | 


--------------------------------------------------------------------------------
/swift/llm/app/__init__.py:
--------------------------------------------------------------------------------
1 | from .app import SwiftApp, app_main
2 | 


--------------------------------------------------------------------------------
/swift/llm/app/locale.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | locale_mapping = {
 3 |     'modify_system': {
 4 |         'en': '🛠️ Set system and clear history',
 5 |         'zh': '🛠️ 设置system并清空历史'
 6 |     },
 7 |     'clear_history': {
 8 |         'en': '🧹 Clear history',
 9 |         'zh': '🧹 清空历史'
10 |     },
11 |     'submit': {
12 |         'en': '🚀 Send',
13 |         'zh': '🚀 发送'
14 |     },
15 |     'regenerate': {
16 |         'en': '🤔️ Regenerate',
17 |         'zh': '🤔️ 重试'
18 |     },
19 |     'upload': {
20 |         'en': '📁 Upload',
21 |         'zh': '📁 上传'
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/swift/llm/argument/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from .app_args import AppArguments
 3 | from .base_args import BaseArguments
 4 | from .deploy_args import DeployArguments
 5 | from .eval_args import EvalArguments
 6 | from .export_args import ExportArguments
 7 | from .infer_args import InferArguments
 8 | from .rlhf_args import RLHFArguments
 9 | from .sampling_args import SamplingArguments
10 | from .train_args import TrainArguments
11 | from .tuner_args import TunerArguments
12 | from .webui_args import WebUIArguments
13 | 


--------------------------------------------------------------------------------
/swift/llm/argument/base_args/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .base_args import BaseArguments
3 | from .utils import to_abspath
4 | 


--------------------------------------------------------------------------------
/swift/llm/argument/base_args/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | import os
 3 | from typing import List, Union
 4 | 
 5 | 
 6 | def to_abspath(path: Union[str, List[str], None], check_path_exist: bool = False) -> Union[str, List[str], None]:
 7 |     """Check the path for validity and convert it to an absolute path.
 8 | 
 9 |     Args:
10 |         path: The path to be checked/converted
11 |         check_path_exist: Whether to check if the path exists
12 | 
13 |     Returns:
14 |         Absolute path
15 |     """
16 |     if path is None:
17 |         return
18 |     elif isinstance(path, str):
19 |         # Remove user path prefix and convert to absolute path.
20 |         path = os.path.abspath(os.path.expanduser(path))
21 |         if check_path_exist and not os.path.exists(path):
22 |             raise FileNotFoundError(f"path: '{path}'")
23 |         return path
24 |     assert isinstance(path, list), f'path: {path}'
25 |     res = []
26 |     for v in path:
27 |         res.append(to_abspath(v, check_path_exist))
28 |     return res
29 | 


--------------------------------------------------------------------------------
/swift/llm/argument/merge_args.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from dataclasses import dataclass
 3 | 
 4 | from swift.utils import get_logger
 5 | 
 6 | logger = get_logger()
 7 | 
 8 | 
 9 | @dataclass
10 | class MergeArguments:
11 |     """
12 |     MergeArguments is a dataclass that holds configuration for merging models.
13 | 
14 |     Args:
15 |         merge_lora (bool): Whether LoRA merging is enabled. Default is False.
16 |         safe_serialization (bool): Whether to use safetensors. Default is True.
17 |         max_shard_size (str): Maximum size of a single shard file. Default is '5GB'.
18 |     """
19 |     merge_lora: bool = False
20 |     safe_serialization: bool = True
21 |     max_shard_size: str = '5GB'
22 | 


--------------------------------------------------------------------------------
/swift/llm/argument/webui_args.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from dataclasses import dataclass
 3 | 
 4 | 
 5 | @dataclass
 6 | class WebUIArguments:
 7 |     """
 8 |     Args:
 9 |         server_name (str): The hostname or IP address to bind the web UI server to. Default is '0.0.0.0'.
10 |         server_port (int): The port number to bind the web UI server to. Default is 7860.
11 |         share (bool): A flag indicating whether to share the web UI publicly. Default is False.
12 |         lang (str): The language setting for the web UI. Default is 'zh'.
13 |     """
14 |     server_name: str = '0.0.0.0'
15 |     server_port: int = 7860
16 |     share: bool = False
17 |     lang: str = 'zh'
18 | 


--------------------------------------------------------------------------------
/swift/llm/dataset/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | import datasets.fingerprint
 3 | from datasets import Dataset as HfDataset
 4 | 
 5 | from ..utils import get_temporary_cache_files_directory
 6 | from . import dataset
 7 | from .loader import DATASET_TYPE, load_dataset
 8 | from .media import MediaResource
 9 | from .preprocessor import (AlpacaPreprocessor, AutoPreprocessor, MessagesPreprocessor, ResponsePreprocessor,
10 |                            RowPreprocessor)
11 | from .register import DATASET_MAPPING, DatasetMeta, SubsetDataset, register_dataset, register_dataset_info
12 | from .utils import (EncodePreprocessor, GetLengthPreprocessor, IterablePackingDataset, LazyLLMDataset, PackingDataset,
13 |                     sample_dataset)
14 | 
15 | datasets.fingerprint.get_temporary_cache_files_directory = get_temporary_cache_files_directory
16 | datasets.arrow_dataset.get_temporary_cache_files_directory = get_temporary_cache_files_directory
17 | register_dataset_info()
18 | 


--------------------------------------------------------------------------------
/swift/llm/dataset/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from . import llm, mllm
3 | 


--------------------------------------------------------------------------------
/swift/llm/dataset/preprocessor/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .core import (DATASET_TYPE, AlpacaPreprocessor, AutoPreprocessor, ClsPreprocessor, MessagesPreprocessor,
3 |                    ResponsePreprocessor, RowPreprocessor)
4 | from .extra import ClsGenerationPreprocessor, GroundingMixin, TextGenerationPreprocessor
5 | 


--------------------------------------------------------------------------------
/swift/llm/ds_config/zero0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 | 
11 |     "bf16": {
12 |         "enabled": "auto"
13 |     },
14 | 
15 |     "zero_optimization": {
16 |         "stage": 0,
17 |         "allgather_partitions": true,
18 |         "allgather_bucket_size": 2e8,
19 |         "overlap_comm": false,
20 |         "reduce_scatter": true,
21 |         "reduce_bucket_size": 2e8,
22 |         "contiguous_gradients": true
23 |     },
24 | 
25 |     "gradient_accumulation_steps": "auto",
26 |     "gradient_clipping": "auto",
27 |     "steps_per_print": 2000,
28 |     "train_batch_size": "auto",
29 |     "train_micro_batch_size_per_gpu": "auto",
30 |     "wall_clock_breakdown": false
31 | }
32 | 


--------------------------------------------------------------------------------
/swift/llm/ds_config/zero1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 | 
11 |     "bf16": {
12 |         "enabled": "auto"
13 |     },
14 | 
15 |     "zero_optimization": {
16 |         "stage": 1,
17 |         "offload_optimizer": {
18 |             "device": "none",
19 |             "pin_memory": true
20 |         },
21 |         "allgather_partitions": true,
22 |         "allgather_bucket_size": 2e8,
23 |         "overlap_comm": false,
24 |         "reduce_scatter": true,
25 |         "reduce_bucket_size": 2e8,
26 |         "contiguous_gradients": true
27 |     },
28 | 
29 |     "gradient_accumulation_steps": "auto",
30 |     "gradient_clipping": "auto",
31 |     "steps_per_print": 2000,
32 |     "train_batch_size": "auto",
33 |     "train_micro_batch_size_per_gpu": "auto",
34 |     "wall_clock_breakdown": false
35 | }
36 | 


--------------------------------------------------------------------------------
/swift/llm/ds_config/zero2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 | 
11 |     "bf16": {
12 |         "enabled": "auto"
13 |     },
14 | 
15 |     "zero_optimization": {
16 |         "stage": 2,
17 |         "offload_optimizer": {
18 |             "device": "none",
19 |             "pin_memory": true
20 |         },
21 |         "allgather_partitions": true,
22 |         "allgather_bucket_size": 2e8,
23 |         "overlap_comm": false,
24 |         "reduce_scatter": true,
25 |         "reduce_bucket_size": 2e8,
26 |         "contiguous_gradients": true
27 |     },
28 | 
29 |     "gradient_accumulation_steps": "auto",
30 |     "gradient_clipping": "auto",
31 |     "steps_per_print": 2000,
32 |     "train_batch_size": "auto",
33 |     "train_micro_batch_size_per_gpu": "auto",
34 |     "wall_clock_breakdown": false
35 | }
36 | 


--------------------------------------------------------------------------------
/swift/llm/ds_config/zero2_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 | 
11 |     "bf16": {
12 |         "enabled": "auto"
13 |     },
14 | 
15 |     "zero_optimization": {
16 |         "stage": 2,
17 |         "offload_optimizer": {
18 |             "device": "cpu",
19 |             "pin_memory": true
20 |         },
21 |         "allgather_partitions": true,
22 |         "allgather_bucket_size": 2e8,
23 |         "overlap_comm": false,
24 |         "reduce_scatter": true,
25 |         "reduce_bucket_size": 2e8,
26 |         "contiguous_gradients": true
27 |     },
28 | 
29 |     "gradient_accumulation_steps": "auto",
30 |     "gradient_clipping": "auto",
31 |     "steps_per_print": 2000,
32 |     "train_batch_size": "auto",
33 |     "train_micro_batch_size_per_gpu": "auto",
34 |     "wall_clock_breakdown": false
35 | }
36 | 


--------------------------------------------------------------------------------
/swift/llm/eval/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .eval import SwiftEval, eval_main
3 | 


--------------------------------------------------------------------------------
/swift/llm/export/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .export import SwiftExport, export_main
3 | from .merge_lora import merge_lora
4 | from .ollama import export_to_ollama
5 | from .quant import quantize_model
6 | 


--------------------------------------------------------------------------------
/swift/llm/infer/infer_engine/patch.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from contextlib import contextmanager
 3 | from functools import wraps
 4 | 
 5 | from transformers import AutoConfig, AutoTokenizer, PretrainedConfig, PreTrainedTokenizerBase
 6 | 
 7 | 
 8 | @contextmanager
 9 | def patch_auto_tokenizer(tokenizer: PreTrainedTokenizerBase):
10 |     _old_from_pretrained = AutoTokenizer.from_pretrained
11 | 
12 |     @wraps(_old_from_pretrained)
13 |     def _from_pretrained(*args, **kwargs):
14 |         return tokenizer
15 | 
16 |     AutoTokenizer.from_pretrained = _from_pretrained
17 |     try:
18 |         yield
19 |     finally:
20 |         AutoTokenizer.from_pretrained = _old_from_pretrained
21 | 
22 | 
23 | @contextmanager
24 | def patch_auto_config(config: PretrainedConfig):
25 |     _old_from_pretrained = AutoConfig.from_pretrained
26 | 
27 |     @wraps(_old_from_pretrained)
28 |     def _from_pretrained(*args, **kwargs):
29 |         return (config, {}) if 'return_unused_kwargs' in kwargs else config
30 | 
31 |     AutoConfig.from_pretrained = _from_pretrained
32 |     try:
33 |         yield
34 |     finally:
35 |         AutoConfig.from_pretrained = _old_from_pretrained
36 | 


--------------------------------------------------------------------------------
/swift/llm/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from . import model
 3 | from .constant import LLMModelType, MLLMModelType, ModelType
 4 | from .model_arch import MODEL_ARCH_MAPPING, ModelArch, ModelKeys, MultiModelKeys, get_model_arch, register_model_arch
 5 | from .register import (MODEL_MAPPING, Model, ModelGroup, ModelMeta, fix_do_sample_warning, get_default_device_map,
 6 |                        get_default_torch_dtype, get_matched_model_meta, get_model_info_meta, get_model_name,
 7 |                        get_model_tokenizer, get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn,
 8 |                        load_by_unsloth, register_model)
 9 | from .utils import HfConfigFactory, ModelInfo, get_llm_model, git_clone_github, safe_snapshot_download
10 | 


--------------------------------------------------------------------------------
/swift/llm/model/model/__init__.py:
--------------------------------------------------------------------------------
1 | from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft,
2 |                minicpm, minimax, mistral, mllm, moonshot, mplug, openbuddy, qwen, skywork, stepfun, telechat, valley,
3 |                yi)
4 | 


--------------------------------------------------------------------------------
/swift/llm/sampling/__init__.py:
--------------------------------------------------------------------------------
1 | from .sampling import sampling_main
2 | 


--------------------------------------------------------------------------------
/swift/llm/template/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from . import template
 3 | from .base import MaxLengthError, Template
 4 | from .constant import TemplateType
 5 | from .grounding import draw_bbox
 6 | from .register import TEMPLATE_MAPPING, get_template, get_template_meta, register_template
 7 | from .template_inputs import InferRequest, TemplateInputs
 8 | from .template_meta import TemplateMeta
 9 | from .utils import Prompt, Word, split_str_parts_by
10 | from .vision_utils import load_file, load_image
11 | 


--------------------------------------------------------------------------------
/swift/llm/template/template/__init__.py:
--------------------------------------------------------------------------------
1 | from . import (deepseek, emu3, gemma, glm, idefics3, internlm, internvl, llama, llava, llm, megrez, microsoft, minicpm,
2 |                minimax, mistral, molmo, moonshot, mplug, openbuddy, pixtral, qwen, stepfun, valley, yi)
3 | 


--------------------------------------------------------------------------------
/swift/llm/train/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .pt import SwiftPt, pt_main
3 | from .rlhf import SwiftRLHF, rlhf_main
4 | from .sft import SwiftSft, sft_main
5 | from .tuner import get_multimodal_target_regex
6 | 


--------------------------------------------------------------------------------
/swift/llm/train/pt.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from typing import List, Union
 3 | 
 4 | from swift.utils import get_logger
 5 | from ..argument import TrainArguments
 6 | from .sft import SwiftSft
 7 | 
 8 | logger = get_logger()
 9 | 
10 | 
11 | class SwiftPt(SwiftSft):
12 |     args_class = TrainArguments
13 |     args: args_class
14 | 
15 |     def _prepare_template(self) -> None:
16 |         self.args.use_chat_template = False
17 |         self.args.loss_scale = 'all'
18 |         logger.info('Setting args.use_chat_template: False')
19 |         logger.info("Setting args.loss_scale: 'all'")
20 |         super()._prepare_template()
21 | 
22 | 
23 | def pt_main(args: Union[List[str], TrainArguments, None] = None):
24 |     return SwiftPt(args).main()
25 | 


--------------------------------------------------------------------------------
/swift/megatron/argument/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .megatron_args import MegatronArguments
3 | from .train_args import MegatronTrainArguments
4 | 


--------------------------------------------------------------------------------
/swift/megatron/model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from . import gpt
3 | from .constant import MegatronModelType
4 | from .register import MegatronModelMeta, get_megatron_model_meta, register_megatron_model
5 | 


--------------------------------------------------------------------------------
/swift/megatron/model/constant.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | class MegatronModelType:
3 |     gpt = 'gpt'
4 | 


--------------------------------------------------------------------------------
/swift/megatron/model/gpt/config.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict
 2 | 
 3 | from ..config import convert_hf_config
 4 | 
 5 | 
 6 | def convert_gpt_hf_config(config) -> Dict[str, Any]:
 7 |     res = convert_hf_config(config)
 8 |     model_type = res.get('model_type')
 9 |     if model_type in {'qwen3', 'qwen3_moe'}:
10 |         res['qk_layernorm'] = True
11 |     if model_type in {'qwen2_moe', 'qwen3_moe'}:
12 |         res.pop('ffn_hidden_size', None)
13 |     return res
14 | 


--------------------------------------------------------------------------------
/swift/megatron/train/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .pt import megatron_pt_main
3 | from .sft import megatron_sft_main
4 | 


--------------------------------------------------------------------------------
/swift/megatron/train/patcher.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from contextlib import contextmanager
 3 | 
 4 | from megatron.training import get_args, global_vars, initialize, training
 5 | 
 6 | 
 7 | @contextmanager
 8 | def patch_megatron_data_collator(data_collator):
 9 |     origin_build_pretraining_data_loader = training.build_pretraining_data_loader
10 | 
11 |     def build_pretraining_data_loader(*_args, **kwargs):
12 |         args = get_args()
13 |         res = origin_build_pretraining_data_loader(*_args, **kwargs)
14 |         if res is not None and args.dataloader_type != 'external':
15 |             res.collate_fn = data_collator
16 |         return res
17 | 
18 |     training.build_pretraining_data_loader = build_pretraining_data_loader
19 |     try:
20 |         yield
21 |     finally:
22 |         training.build_pretraining_data_loader = origin_build_pretraining_data_loader
23 | 


--------------------------------------------------------------------------------
/swift/megatron/train/pt.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from typing import List, Union
 3 | 
 4 | from swift.utils import get_logger
 5 | from ..argument import MegatronTrainArguments
 6 | from .sft import MegatronSft
 7 | 
 8 | logger = get_logger()
 9 | 
10 | 
11 | class MegatronPt(MegatronSft):
12 |     args_class = MegatronTrainArguments
13 |     args: args_class
14 | 
15 |     def _prepare_template(self) -> None:
16 |         self.args.use_chat_template = False
17 |         self.args.loss_scale = 'all'
18 |         logger.info('Setting args.use_chat_template: False')
19 |         logger.info("Setting args.loss_scale: 'all'")
20 |         super()._prepare_template()
21 | 
22 | 
23 | def megatron_pt_main(args: Union[List[str], MegatronTrainArguments, None] = None):
24 |     return MegatronPt(args).main()
25 | 


--------------------------------------------------------------------------------
/swift/megatron/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 
3 | from .convert import convert_hf2mcore, convert_mcore2hf
4 | from .patcher import patch_megatron_tokenizer
5 | 


--------------------------------------------------------------------------------
/swift/megatron/utils/patcher.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
 3 | from megatron.training import get_args, global_vars, initialize, training
 4 | 
 5 | from swift.utils import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | 
10 | def patch_megatron_tokenizer(tokenizer):
11 | 
12 |     def build_tokenizer(args):
13 |         return tokenizer
14 | 
15 |     global_vars.build_tokenizer = build_tokenizer
16 | 
17 | 
18 | def patch_torch_dist_shard(thread_count):
19 |     __init__ = TorchDistSaveShardedStrategy.__init__
20 | 
21 |     def __new_init__(*args, **kwargs):
22 |         kwargs['thread_count'] = thread_count
23 |         return __init__(*args, **kwargs)
24 | 
25 |     TorchDistSaveShardedStrategy.__init__ = __new_init__
26 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/__init__.py:
--------------------------------------------------------------------------------
1 | from .loss_scale import loss_scale_map
2 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/agentflan.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "response":{
 3 |         "Name:": [1.0, 3.0],
 4 |         "Action:": [1.0, 3.0],
 5 |         "ACTION:": [1.0,3.0],
 6 |         "Tool:": [1.0, 3.0],
 7 |         "Command": [1.0, 3.0],
 8 |         "Arguments:": [1.0, 3.0],
 9 |         "action input": [1.0, 3.0],
10 |         "ACTION_INPUT:":[1.0, 3.0],
11 |         "Action Input:": [1.0, 3.0],
12 |         "Thought:": [1.0, 1.0],
13 |         "Final Answer:": [1.0, 1.0],
14 |         "Observation:": [2.0, 0.0]
15 |     },
16 |     "query":{
17 |         "What is the tool you want to use": [3.0],
18 |         "What are the required parameter names": [3.0],
19 |         "What is the value of": [3.0],
20 |         "What are the required parameter names for this tool": [3.0]
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/alpha_umi.json:
--------------------------------------------------------------------------------
1 | {
2 |     "Action:": [2.0, 2.0],
3 |     "Action Input:": [2.0, 2.0],
4 |     "Thought:": [1.0, 1.0],
5 |     "Final Answer:": [1.0, 1.0],
6 |     "Observation:": [2.0, 0.0],
7 |     "Next:": [2.0, 2.0]
8 | }
9 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/hermes.json:
--------------------------------------------------------------------------------
1 | {
2 |     "<tool_call>.+?</tool_call>": [2.0]
3 | }
4 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/ignore_empty_think.json:
--------------------------------------------------------------------------------
1 | {
2 |     "<think>\\s*</think>\\s*": [0.0]
3 | }
4 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/qwen.json:
--------------------------------------------------------------------------------
1 | {
2 |     "✿FUNCTION✿:": [2.0, 2.0],
3 |     "✿ARGS✿:": [2.0, 2.0],
4 |     "✿RETURN✿:": [1.0, 1.0],
5 |     "✿RESULT✿:": [2.0, 0.0]
6 | }
7 | 


--------------------------------------------------------------------------------
/swift/plugin/loss_scale/config/react.json:
--------------------------------------------------------------------------------
1 | {
2 |     "Action:": [2.0, 2.0],
3 |     "Action Input:": [2.0, 2.0],
4 |     "Thought:": [1.0, 1.0],
5 |     "Final Answer:": [1.0, 1.0],
6 |     "Observation:": [2.0, 0.0]
7 | }
8 | 


--------------------------------------------------------------------------------
/swift/trainers/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/trainers/optimizers/galore/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | 
 3 | from typing import TYPE_CHECKING
 4 | 
 5 | from swift.utils.import_utils import _LazyModule
 6 | 
 7 | if TYPE_CHECKING:
 8 |     from .utils import create_optimizer_and_scheduler, GaLoreConfig
 9 |     from .adafactor import GaLoreAdafactor
10 |     from .adamw8bit import GaLoreAdamW8bit
11 |     from .adamw import GaLoreAdamW
12 | else:
13 |     _import_structure = {
14 |         'utils': ['GaLoreConfig', 'create_optimizer_and_scheduler'],
15 |         'adafactor': ['GaLoreAdafactor'],
16 |         'adamw8bit': ['GaLoreAdamW8bit'],
17 |         'adamw': ['GaLoreAdamW'],
18 |     }
19 | 
20 |     import sys
21 | 
22 |     sys.modules[__name__] = _LazyModule(
23 |         __name__,
24 |         globals()['__file__'],
25 |         _import_structure,
26 |         module_spec=__spec__,
27 |         extra_objects={},
28 |     )
29 | 


--------------------------------------------------------------------------------
/swift/trainers/rlhf_trainer/orpo_trainer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | from typing import Optional, Union
 3 | 
 4 | import torch.nn as nn
 5 | from transformers import PreTrainedModel
 6 | from trl import ORPOTrainer as HFORPOTrainer
 7 | 
 8 | from ..mixin import SwiftMixin
 9 | from .rlhf_mixin import RLHFTrainerMixin
10 | 
11 | del HFORPOTrainer.__init__
12 | 
13 | 
14 | class ORPOTrainer(RLHFTrainerMixin, SwiftMixin, HFORPOTrainer):
15 | 
16 |     def __init__(self, model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, *_args, **kwargs):
17 |         ref_model = kwargs.get('ref_model')
18 |         assert ref_model is None, 'ORPO does not require a ref_model.'
19 |         super().__init__(model, *_args, **kwargs)
20 | 


--------------------------------------------------------------------------------
/swift/trainers/sequence_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | if os.environ.get('SEQUENCE_PARALLEL_IMPL', 'ulysses') == 'xtuner':
4 |     from .xtuner import XTuner
5 |     sequence_parallel = XTuner()
6 | else:
7 |     from .ulysses import Ulysses
8 |     sequence_parallel = Ulysses()
9 | 


--------------------------------------------------------------------------------
/swift/trainers/sequence_parallel/base.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from abc import abstractmethod
 3 | 
 4 | 
 5 | class SequenceParallel(abc.ABC):
 6 | 
 7 |     @abstractmethod
 8 |     def init_sequence_parallel(self, size):
 9 |         pass
10 | 
11 |     @abstractmethod
12 |     def prepare_model(self, model, tokenizer):
13 |         pass
14 | 
15 |     @abstractmethod
16 |     def pad_and_split_inputs(self,
17 |                              input_ids,
18 |                              input_embeds,
19 |                              labels,
20 |                              position_ids,
21 |                              attention_mask,
22 |                              loss_scale,
23 |                              embed_tokens=None):
24 |         pass
25 | 
26 |     @abstractmethod
27 |     def reduce_outputs(self, loss, labels):
28 |         pass
29 | 
30 |     @property
31 |     def sp_group(self):
32 |         return None
33 | 
34 |     @abstractmethod
35 |     def world_size(self):
36 |         pass
37 | 
38 |     @abstractmethod
39 |     def prepare_trainer(self, trainer):
40 |         pass
41 | 
42 |     @abstractmethod
43 |     def get_dataloader(self, trainer, dataset, batch_size):
44 |         pass
45 | 


--------------------------------------------------------------------------------
/swift/tuners/longlora/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/tuners/scetuning/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .scetuning import SCETuning, SCETuningConfig
3 | 


--------------------------------------------------------------------------------
/swift/ui/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from .app import webui_main
3 | 


--------------------------------------------------------------------------------
/swift/ui/llm_eval/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/ui/llm_export/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/ui/llm_grpo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/swift/ui/llm_grpo/__init__.py


--------------------------------------------------------------------------------
/swift/ui/llm_grpo/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | from swift.ui.llm_train.model import Model as TrainModel
3 | 
4 | 
5 | class Model(TrainModel):
6 |     group = 'llm_grpo'
7 | 


--------------------------------------------------------------------------------
/swift/ui/llm_infer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/ui/llm_train/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/swift/utils/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Alibaba, Inc. and its affiliates.
 2 | 
 3 | BIN_EXTENSIONS = [
 4 |     '.*.bin',
 5 |     '.*.ts',
 6 |     '.*.pt',
 7 |     '.*.data-00000-of-00001',
 8 |     '.*.onnx',
 9 |     '.*.meta',
10 |     '.*.pb',
11 |     '.*.index',
12 | ]
13 | 
14 | PEFT_TYPE_KEY = 'peft_type'
15 | SWIFT_TYPE_KEY = 'swift_type'
16 | DEFAULT_ADAPTER = 'default'
17 | 
18 | 
19 | class Invoke(object):
20 |     KEY = 'invoked_by'
21 |     THIRD_PARTY = 'third_party'
22 |     PRETRAINED = 'from_pretrained'
23 |     PIPELINE = 'pipeline'
24 |     TRAINER = 'trainer'
25 |     LOCAL_TRAINER = 'local_trainer'
26 |     PREPROCESSOR = 'preprocessor'
27 |     SWIFT = 'swift'
28 | 


--------------------------------------------------------------------------------
/swift/version.py:
--------------------------------------------------------------------------------
1 | # Make sure to modify __release_datetime__ to release time when making official release.
2 | __version__ = '3.5.0.dev0'
3 | # default release datetime for branches under active development is set
4 | # to be a time far-far-away-into-the-future
5 | __release_datetime__ = '2099-10-13 08:56:12'
6 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/__init__.py


--------------------------------------------------------------------------------
/tests/app/test_app.py:
--------------------------------------------------------------------------------
 1 | def test_llm():
 2 |     from swift.llm import app_main, AppArguments
 3 |     app_main(AppArguments(model='Qwen/Qwen2.5-0.5B-Instruct'))
 4 | 
 5 | 
 6 | def test_lora():
 7 |     from swift.llm import app_main, AppArguments
 8 |     app_main(AppArguments(adapters='swift/test_lora', lang='en', studio_title='小黄'))
 9 | 
10 | 
11 | def test_mllm():
12 |     from swift.llm import app_main, AppArguments
13 |     app_main(AppArguments(model='Qwen/Qwen2-VL-7B-Instruct', stream=True))
14 | 
15 | 
16 | def test_audio():
17 |     from swift.llm import AppArguments, app_main, DeployArguments, run_deploy
18 |     deploy_args = DeployArguments(model='Qwen/Qwen2-Audio-7B-Instruct', infer_backend='pt', verbose=False)
19 | 
20 |     with run_deploy(deploy_args, return_url=True) as url:
21 |         app_main(AppArguments(model='Qwen2-Audio-7B-Instruct', base_url=url, stream=True))
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     test_mllm()
26 | 


--------------------------------------------------------------------------------
/tests/general/test_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | from swift.utils import get_device
 6 | 
 7 | os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
 8 | 
 9 | 
10 | def test_qwen2():
11 |     import os
12 |     from swift.llm import get_model_tokenizer
13 |     model, tokenizer = get_model_tokenizer('Qwen/Qwen2-7B-Instruct', load_model=False)
14 |     print(f'model: {model}, tokenizer: {tokenizer}')
15 |     # test hf
16 |     model, tokenizer = get_model_tokenizer('Qwen/Qwen2-7B-Instruct', load_model=False, use_hf=True)
17 | 
18 |     model, tokenizer = get_model_tokenizer(
19 |         'Qwen/Qwen2-7B-Instruct', torch.float32, device_map=get_device(), attn_impl='flash_attn')
20 |     print(f'model: {model}, tokenizer: {tokenizer}')
21 | 
22 | 
23 | def test_modelscope_hub():
24 |     from swift.llm import get_model_tokenizer
25 |     model, tokenizer = get_model_tokenizer('Qwen/Qwen2___5-Math-1___5B-Instruct/', load_model=False)
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     test_qwen2()
30 |     # test_modelscope_hub()
31 | 


--------------------------------------------------------------------------------
/tests/general/test_stream.py:
--------------------------------------------------------------------------------
 1 | from swift.llm import load_dataset
 2 | 
 3 | 
 4 | def test_local_dataset():
 5 |     # please use git clone
 6 |     from swift.llm import git_clone_github
 7 |     model_dir = git_clone_github('https://www.modelscope.cn/datasets/swift/swift-sft-mixture.git')
 8 |     dataset = load_dataset(datasets=[f'{model_dir}:firefly'], streaming=True)[0]
 9 |     print(next(iter(dataset)))
10 | 
11 | 
12 | def test_hub_dataset():
13 |     local_dataset = 'swift/swift-sft-mixture:firefly'
14 |     dataset = load_dataset(datasets=[local_dataset], streaming=True)[0]
15 |     print(next(iter(dataset)))
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     test_local_dataset()
20 |     # test_hub_dataset()
21 | 


--------------------------------------------------------------------------------
/tests/hub/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/hub/__init__.py


--------------------------------------------------------------------------------
/tests/hub/test_check_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import tempfile
 4 | import unittest
 5 | 
 6 | from modelscope import Model, check_local_model_is_latest
 7 | 
 8 | 
 9 | class TestCheckModel(unittest.TestCase):
10 | 
11 |     def setUp(self):
12 |         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
13 |         self.tmp_dir = tempfile.TemporaryDirectory().name
14 |         if not os.path.exists(self.tmp_dir):
15 |             os.makedirs(self.tmp_dir)
16 | 
17 |     def tearDown(self):
18 |         import peft
19 |         shutil.rmtree(self.tmp_dir)
20 |         super().tearDown()
21 | 
22 |     def test_check_model(self):
23 |         model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base', revision='v1.0.0')
24 |         self.assertFalse(check_local_model_is_latest(model.model_dir))
25 | 


--------------------------------------------------------------------------------
/tests/infer/test_agent.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 6 | 
 7 | kwargs = {
 8 |     'per_device_train_batch_size': 2,
 9 |     'save_steps': 50,
10 |     'gradient_accumulation_steps': 4,
11 |     'num_train_epochs': 1,
12 | }
13 | 
14 | 
15 | def test_sft():
16 |     os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
17 |     from swift.llm import sft_main, TrainArguments, infer_main, InferArguments
18 |     sft_main(
19 |         TrainArguments(model='Qwen/Qwen2-7B-Instruct', dataset=['iic/ms_agent#2000'], loss_scale='react', **kwargs))
20 | 
21 | 
22 | def test_infer():
23 |     from swift.llm import infer_main, InferArguments
24 |     ckpt_dir = 'output/Qwen2-7B-Instruct/v229-20241126-133152/checkpoint-100'
25 |     infer_main(InferArguments(ckpt_dir=ckpt_dir))
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     test_sft()
30 |     # test_infer()
31 | 


--------------------------------------------------------------------------------
/tests/infer/test_max_memory.py:
--------------------------------------------------------------------------------
 1 | from swift.llm import InferArguments, infer_main
 2 | 
 3 | 
 4 | def test_max_memory():
 5 |     infer_main(
 6 |         InferArguments(model='Qwen/Qwen2.5-7B-Instruct', max_memory='{0: "50GB", 1: "5GB"}', device_map='sequential'))
 7 | 
 8 | 
 9 | if __name__ == '__main__':
10 |     test_max_memory()
11 | 


--------------------------------------------------------------------------------
/tests/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/llm/__init__.py


--------------------------------------------------------------------------------
/tests/llm/config/infer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "ckpt_dir": "/mnt/workspace/yzhao/modelscope/swift/output/pai_test/checkpoint-6",
3 |     "val_dataset_sample": 2,
4 |     "load_dataset_config": true
5 | }
6 | 


--------------------------------------------------------------------------------
/tests/llm/config/sft.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model_type": "qwen-1_8b-chat",
3 |     "dataset": "jd-sentiment-zh",
4 |     "output_dir": "output/pai_test",
5 |     "train_dataset_sample": 100,
6 |     "eval_steps": 5
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/llm/data/alpaca.csv:
--------------------------------------------------------------------------------
1 | system,instruction,input,output
2 | 00000,11111,22222,3.3
3 | ,aaaaa,,ccccc
4 | ,AAAAA,BBBBB,CCCCC
5 | 


--------------------------------------------------------------------------------
/tests/llm/data/alpaca.jsonl:
--------------------------------------------------------------------------------
1 | {"instruction": "11111", "input": "22222", "output": "33333", "history": [["aaaaa", "bbbbb"]], "system": "system123"}
2 | {"instruction": "aaaaa", "output": "ccccc"}
3 | {"instruction": "AAAAA", "input": "BBBBB", "output": "CCCCC"}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/alpaca2.csv:
--------------------------------------------------------------------------------
1 | instruction,output
2 | 11111,33333
3 | aaaaa,ccccc
4 | AAAAA,CCCCC
5 | 


--------------------------------------------------------------------------------
/tests/llm/data/chatml.jsonl:
--------------------------------------------------------------------------------
1 | {"messages": [{"role": "system", "content": "00000"}, {"role": "user", "content": "11111"}, {"role": "assistant", "content": "22222"}]}
2 | {"messages": [{"role": "user", "content": "aaaaa"}, {"role": "assistant", "content": "bbbbb"}, {"role": "user", "content": "ccccc"}, {"role": "assistant", "content": "ddddd"}]}
3 | {"messages": [{"role": "user", "content": "AAAAA"}, {"role": "assistant", "content": "BBBBB"}, {"role": "user", "content": "CCCCC"}, {"role": "assistant", "content": "DDDDD"}]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/conversations.jsonl:
--------------------------------------------------------------------------------
1 | {"conversations": [{"from": "system", "value": "00000"}, {"from": "user", "value": "11111"}, {"from": "assistant", "value": "22222"}]}
2 | {"conversations": [{"from": "user", "value": "aaaaa"}, {"from": "assistant", "value": "bbbbb"}, {"from": "user", "value": "ccccc"}, {"from": "assistant", "value": "ddddd"}]}
3 | {"conversations": [{"from": "user", "value": "AAAAA"}, {"from": "assistant", "value": "BBBBB"}, {"from": "user", "value": "CCCCC"}, {"from": "assistant", "value": "DDDDD"}]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/multi_modal_1.jsonl:
--------------------------------------------------------------------------------
1 | {"query": "<img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>55555", "response": "66666"}
2 | {"query": "<img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img><img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>eeeee", "response": "fffff", "history": [["hello", "123"]]}
3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/multi_modal_2.jsonl:
--------------------------------------------------------------------------------
1 | {"query": "55555", "response": "66666", "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
2 | {"query": "eeeee", "response": "fffff", "history": [], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/multi_modal_3.jsonl:
--------------------------------------------------------------------------------
1 | {"query": "55555", "response": "66666", "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
2 | {"query": "eeeee", "response": "fffff", "history": [], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "images": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/sharegpt.jsonl:
--------------------------------------------------------------------------------
1 | {"system": "00000", "conversation": [{"human": "11111", "assistant": "22222"}]}
2 | {"conversation": [{"human": "aaaaa", "assistant": "bbbbb"}]}
3 | {"conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_multi.json:
--------------------------------------------------------------------------------
1 | [{"system": "00000", "query": "55555", "response": "66666"},
2 | {"query": "eeeee", "response": "fffff", "history": []},
3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]}]
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_multi.jsonl:
--------------------------------------------------------------------------------
1 | {"system": "00000", "query": "55555", "response": "66666"}
2 | {"query": "eeeee", "response": "fffff", "history": []}
3 | {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_pre.csv:
--------------------------------------------------------------------------------
1 | response
2 | 11111
3 | aaaaa
4 | AAAAA
5 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_pre.jsonl:
--------------------------------------------------------------------------------
1 | {"response": "11111"}
2 | {"response": "aaaaa"}
3 | {"response": "AAAAA"}
4 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_single.csv:
--------------------------------------------------------------------------------
1 | system,query,response
2 | 00000,11111,22222
3 | ,aaaaa,bbbbb
4 | ,AAAAA,BBBBB
5 | 


--------------------------------------------------------------------------------
/tests/llm/data/swift_single.jsonl:
--------------------------------------------------------------------------------
1 | {"system": "00000", "query": "11111", "response": "22222"}
2 | {"query": "aaaaa", "response": "bbbbb"}
3 | {"query": "AAAAA", "response": "BBBBB"}
4 | 


--------------------------------------------------------------------------------
/tests/llm/test_dataset.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from swift.llm import load_dataset
 4 | 
 5 | 
 6 | class TestDataset(unittest.TestCase):
 7 | 
 8 |     def test_load_v_dataset(self):
 9 |         if not __name__ == '__main__':
10 |             # ignore citest error in github
11 |             return
12 | 
13 |         for ds in ['m3it#1000', 'mantis-instruct#1000', 'llava-med-zh-instruct#1000']:
14 |             ds = load_dataset(ds)
15 |             assert len(ds[0]) > 800
16 | 
17 | 
if __name__ == '__main__':
    # Run this module's test cases when the file is executed directly.
    unittest.main()
20 | 


--------------------------------------------------------------------------------
/tests/llm/test_utils.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from swift.llm import load_dataset
 4 | from swift.utils import lower_bound
 5 | 
 6 | 
 7 | class TestLlmUtils(unittest.TestCase):
 8 | 
 9 |     def test_count_startswith(self):
10 |         arr = [-100] * 1000 + list(range(1000))
11 |         self.assertTrue(lower_bound(0, len(arr), lambda i: arr[i] != -100) == 1000)
12 | 
13 |     def test_count_endswith(self):
14 |         arr = list(range(1000)) + [-100] * 1000
15 |         self.assertTrue(lower_bound(0, len(arr), lambda i: arr[i] == -100) == 1000)
16 | 
17 |     @unittest.skip('avoid ci error')
18 |     def test_dataset(self):
19 |         dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#200'],
20 |                                num_proc=4,
21 |                                strict=False,
22 |                                download_mode='force_redownload')
23 |         print(f'dataset[0]: {dataset[0]}')
24 |         print(f'dataset[1]: {dataset[1]}')
25 | 
26 | 
if __name__ == '__main__':
    # Run this module's test cases when the file is executed directly.
    unittest.main()
29 | 


--------------------------------------------------------------------------------
/tests/models/test_flash_attn.py:
--------------------------------------------------------------------------------
1 | from swift.llm import get_model_tokenizer
2 | 
if __name__ == '__main__':
    # Other checkpoints this script has been pointed at; swap one in to try it:
    #   'Qwen/Qwen2-7B-Instruct'
    #   'AIDC-AI/Ovis2-2B'
    #   'OpenGVLab/InternVL2-2B'
    model_id = 'Shanghai_AI_Laboratory/internlm3-8b-instruct'
    # Load the model with the flash-attention implementation and dump its structure.
    model, tokenizer = get_model_tokenizer(model_id, attn_impl='flash_attn')
    print(model)
9 | 


--------------------------------------------------------------------------------
/tests/models/test_llm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
 4 | 
 5 | 
 6 | def test_llama3():
 7 |     from swift.llm import infer_main, InferArguments
 8 |     infer_main(
 9 |         InferArguments(
10 |             model='LLM-Research/Meta-Llama-3.1-8B-Instruct',
11 |             max_batch_size=2,
12 |             val_dataset='AI-ModelScope/alpaca-gpt4-data-en#2'))
13 | 
14 | 
if __name__ == '__main__':
    # Direct-run entry point for the Llama 3.1 inference check.
    test_llama3()
17 | 


--------------------------------------------------------------------------------
/tests/models/test_mllm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 4 | 
 5 | 
 6 | def test_cogvlm():
 7 |     from swift.llm import infer_main, InferArguments, sft_main, TrainArguments
 8 |     # infer_main(InferArguments(model='ZhipuAI/cogvlm2-video-llama3-chat'))
 9 |     sft_main(
10 |         TrainArguments(
11 |             model='ZhipuAI/cogvlm2-video-llama3-chat',
12 |             dataset=['AI-ModelScope/alpaca-gpt4-data-zh#200', 'swift/VideoChatGPT:Generic#200']))
13 | 
14 | 
if __name__ == '__main__':
    # Direct-run entry point for the CogVLM2-video training check.
    test_cogvlm()
17 | 


--------------------------------------------------------------------------------
/tests/run_config.yaml:
--------------------------------------------------------------------------------
1 | # Isolate test cases by environment so that each environment can install its own dependencies.
2 | isolated:  # Test cases that may need an excessive amount of GPU memory or run for a long time; they are executed in a dedicated process.
3 | 
4 | envs:
5 |   default: # Default env (pytorch); any case not assigned to another env runs here.
6 |     dependencies: # Required packages, installed with pip before the test cases run.
7 |       - numpy>=1.20,<=1.22.0
8 |       - protobuf<4,>=3.20.2
9 | 


--------------------------------------------------------------------------------
/tests/test_align/test_rlhf_loss.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/test_align/test_rlhf_loss.py


--------------------------------------------------------------------------------
/tests/test_align/test_template/test_gene.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 6 | os.environ['SWIFT_DEBUG'] = '1'
 7 | 
 8 | 
 9 | def test_deepseek_janus_pro_gene():
10 |     from swift.llm import infer_main, InferArguments
11 |     args = InferArguments(model='deepseek-ai/Janus-Pro-1B', infer_backend='pt')
12 |     infer_main(args)
13 | 
14 | 
def test_emu3_gen(infer_backend='pt'):
    """Launch non-streaming inference for ``BAAI/Emu3-Gen`` without the chat template.

    Args:
        infer_backend: Backend name forwarded to ``InferArguments``. Defaults
            to ``'pt'`` so the function can be called with no arguments; a
            ``test_``-prefixed function with a required parameter cannot be
            invoked by a test collector that supplies none.
    """
    from swift.llm import InferArguments, infer_main
    infer_args = InferArguments(
        model='BAAI/Emu3-Gen',
        infer_backend=infer_backend,
        stream=False,
        use_chat_template=False,
        top_k=2048,
        max_new_tokens=40960,
    )
    infer_main(infer_args)
25 | 
26 | 
if __name__ == '__main__':
    # Direct-run entry point: only the Janus-Pro check is enabled.
    # Uncomment to run the Emu3 generation check on the pt backend instead.
    # test_emu3_gen('pt')
    test_deepseek_janus_pro_gene()
30 | 


--------------------------------------------------------------------------------
/tests/train/test_grounding.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from swift.llm import TrainArguments, sft_main
4 | 
# 16 * 28 * 28 = 12544, stored as a string because environment values must be text.
# NOTE(review): presumably read by swift to cap image resolution - confirm against the library.
os.environ['MAX_PIXELS'] = str(16 * 28 * 28)

if __name__ == '__main__':
    # Direct-run entry point: fine-tune Qwen2.5-VL on the coco#2000 dataset spec.
    train_args = TrainArguments(model='Qwen/Qwen2.5-VL-7B-Instruct', dataset='AI-ModelScope/coco#2000')
    sft_main(train_args)
9 | 


--------------------------------------------------------------------------------
/tests/train/test_sample.py:
--------------------------------------------------------------------------------
 1 | from swift.llm import SamplingArguments, sampling_main
 2 | 
 3 | 
 4 | def test_sampling():
 5 |     sampling_main(
 6 |         SamplingArguments(
 7 |             model='LLM-Research/Meta-Llama-3.1-8B-Instruct',
 8 |             sampler_engine='pt',
 9 |             num_return_sequences=5,
10 |             dataset='AI-ModelScope/alpaca-gpt4-data-zh#5'))
11 | 
12 | 
if __name__ == '__main__':
    # Direct-run entry point for the sampling check.
    test_sampling()
15 | 


--------------------------------------------------------------------------------
/tests/train/test_train_eval.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | kwargs = {
 4 |     'per_device_train_batch_size': 5,
 5 |     'save_steps': 5,
 6 |     'gradient_accumulation_steps': 1,
 7 |     'num_train_epochs': 1,
 8 | }
 9 | 
10 | 
def test_train_eval_loop():
    """Train ``Qwen/Qwen2.5-0.5B-Instruct`` with step-based evaluation enabled.

    Evaluation runs every 5 steps with evalscope on ``gsm8k`` (zero few-shot
    examples, limit 10) and results are reported to wandb. The remaining
    trainer options come from the module-level ``kwargs`` dict.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,2'
    from swift.llm import TrainArguments, sft_main

    train_args = TrainArguments(
        model='Qwen/Qwen2.5-0.5B-Instruct',
        dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100'],
        target_modules=['all-linear', 'all-embedding'],
        modules_to_save=['all-embedding', 'all-norm'],
        # Evaluation schedule.
        eval_strategy='steps',
        eval_steps=5,
        per_device_eval_batch_size=5,
        # Evalscope-backed benchmark evaluation.
        eval_use_evalscope=True,
        eval_datasets=['gsm8k'],
        eval_datasets_args={'gsm8k': {'few_shot_num': 0}},
        eval_limit=10,
        report_to=['wandb'],
        **kwargs,
    )
    sft_main(train_args)
31 | 
32 | 
if __name__ == '__main__':
    # Direct-run entry point for the train-plus-eval loop check.
    test_train_eval_loop()
35 | 


--------------------------------------------------------------------------------
/tests/train/test_vit_lr.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 4 | 
 5 | 
 6 | def test_vit_lr():
 7 |     # https://github.com/QwenLM/Qwen2.5-VL/tree/main/qwen-vl-finetune
 8 |     from swift.llm import sft_main, TrainArguments
 9 |     sft_main(
10 |         TrainArguments(
11 |             model='Qwen/Qwen2.5-VL-7B-Instruct',
12 |             dataset=['AI-ModelScope/LaTeX_OCR#20000'],
13 |             vit_lr=2e-5,
14 |             learning_rate=1e-5,
15 |             aligner_lr=1e-4,
16 |             freeze_llm=False,
17 |             freeze_vit=False,
18 |             freeze_aligner=False))
19 | 
20 | 
if __name__ == '__main__':
    # Direct-run entry point for the per-component learning-rate check.
    test_vit_lr()
23 | 


--------------------------------------------------------------------------------
/tests/tuners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/tuners/__init__.py


--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/ms-swift/23df7f3344df4b4ae4aed5ad98fcc38ba6a90ad5/tests/utils/__init__.py


--------------------------------------------------------------------------------
/tests/utils/test_split_str_parts_by.py:
--------------------------------------------------------------------------------
 1 | from swift.llm.template import split_str_parts_by
 2 | 
 3 | 
 4 | def test_split_str_parts_by():
 5 |     print(split_str_parts_by('aaaAction:bb\nbAction Inputs:\nabbb', ['Action:', 'Action Inputs:'], regex_mode=False))
 6 |     print(split_str_parts_by('aaaAction:bb\nbAction Inputs:\nabbb', ['Action:', 'Action Inputs:'], regex_mode=True))
 7 |     print(split_str_parts_by('aaa<tool_call>bbb</tool_call>ccc', ['<tool_call>.+?</tool_call>'], regex_mode=True))
 8 |     print(split_str_parts_by('aaa<image>\nbb\nb<audio>\nabbb', ['<image>', '<audio>', '<video>'], regex_mode=False))
 9 |     print(split_str_parts_by('aaa<image>\nbb\nb<audio>\nabbb', ['<image>', '<audio>', '<video>'], regex_mode=True))
10 | 
11 | 
if __name__ == '__main__':
    # Direct-run entry point: print the split results for inspection.
    test_split_str_parts_by()
14 | 


--------------------------------------------------------------------------------
/tests/utils/test_torch_utils.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from modelscope import Model
 4 | 
 5 | from swift.utils.torch_utils import find_sub_module
 6 | 
 7 | 
 8 | class TestTorchUtils(unittest.TestCase):
 9 | 
10 |     def test_find_sub_module(self):
11 |         model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
12 |         self.assertTrue(find_sub_module(model, 'query') is not None)
13 | 
14 | 
if __name__ == '__main__':
    # Run this module's test cases when the file is executed directly.
    unittest.main()
17 | 


--------------------------------------------------------------------------------