├── .github ├── CONTRIBUTING.md └── workflows │ ├── deploy.yml │ └── lint.yml ├── .gitignore ├── .owners.yml ├── .pre-commit-config.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_zh-CN.md ├── docs ├── en │ ├── .readthedocs.yaml │ ├── Makefile │ ├── _static │ │ ├── css │ │ │ └── readthedocs.css │ │ └── image │ │ │ └── logo.png │ ├── acceleration │ │ ├── benchmark.rst │ │ ├── deepspeed.rst │ │ ├── flash_attn.rst │ │ ├── hyper_parameters.rst │ │ ├── length_grouped_sampler.rst │ │ ├── pack_to_max_length.rst │ │ ├── train_extreme_long_sequence.rst │ │ ├── train_large_scale_dataset.rst │ │ └── varlen_flash_attn.rst │ ├── chat │ │ ├── agent.md │ │ ├── llm.md │ │ ├── lmdeploy.md │ │ └── vlm.md │ ├── conf.py │ ├── dpo │ │ ├── modify_settings.md │ │ ├── overview.md │ │ └── quick_start.md │ ├── evaluation │ │ ├── hook.md │ │ ├── mmbench.md │ │ ├── mmlu.md │ │ └── opencompass.md │ ├── get_started │ │ ├── installation.md │ │ ├── overview.md │ │ └── quickstart.md │ ├── index.rst │ ├── internevo_migration │ │ ├── ftdp_dataset │ │ │ ├── Case1.rst │ │ │ ├── Case2.rst │ │ │ ├── Case3.rst │ │ │ ├── Case4.rst │ │ │ └── ftdp.rst │ │ └── internevo_migration.rst │ ├── make.bat │ ├── models │ │ └── supported.md │ ├── notes │ │ └── changelog.md │ ├── preparation │ │ ├── pretrained_model.rst │ │ └── prompt_template.rst │ ├── reward_model │ │ ├── modify_settings.md │ │ ├── overview.md │ │ ├── preference_data.md │ │ └── quick_start.md │ ├── switch_language.md │ ├── training │ │ ├── custom_agent_dataset.rst │ │ ├── custom_pretrain_dataset.rst │ │ ├── custom_sft_dataset.rst │ │ ├── modify_settings.rst │ │ ├── multi_modal_dataset.rst │ │ ├── open_source_dataset.rst │ │ └── visualization.rst │ └── user_guides │ │ ├── chat.md │ │ ├── dataset_format.md │ │ ├── dataset_prepare.md │ │ ├── finetune.md │ │ ├── incremental_pretraining.md │ │ ├── intern_repo_dataset.md │ │ ├── multi_turn_conversation.md │ │ ├── prompt_template.md │ │ └── single_turn_conversation.md └── zh_cn │ ├── .readthedocs.yaml │ ├── Makefile │ ├── _static │ └── image │ │ └── logo.png │ ├── acceleration │ ├── benchmark.rst │ ├── deepspeed.rst │ ├── flash_attn.rst │ ├── hyper_parameters.rst │ ├── length_grouped_sampler.rst │ ├── pack_to_max_length.rst │ ├── train_extreme_long_sequence.rst │ ├── train_large_scale_dataset.rst │ └── varlen_flash_attn.rst │ ├── chat │ ├── agent.md │ ├── llm.md │ ├── lmdeploy.md │ └── vlm.md │ ├── conf.py │ ├── dpo │ ├── modify_settings.md │ ├── overview.md │ └── quick_start.md │ ├── evaluation │ ├── hook.md │ ├── mmbench.md │ ├── mmlu.md │ └── opencompass.md │ ├── get_started │ ├── installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── internevo_migration │ ├── differences.rst │ └── ftdp_dataset │ │ ├── processed_and_internlm2.rst │ │ ├── processed_and_others.rst │ │ ├── processed_normal_chat.rst │ │ └── tokenized_and_internlm2.rst │ ├── make.bat │ ├── models │ └── supported.md │ ├── notes │ └── changelog.md │ ├── preparation │ ├── pretrained_model.rst │ └── prompt_template.rst │ ├── reward_model │ ├── images │ │ ├── preference_data.png │ │ ├── sequence_parallel.png │ │ └── var_len_atten.png │ ├── modify_settings.md │ ├── overview.md │ ├── preference_data.md │ └── quick_start.md │ ├── switch_language.md │ ├── training │ ├── custom_pretrain_dataset.rst │ ├── custom_sft_dataset.rst │ ├── modify_settings.rst │ ├── multi_modal_dataset.rst │ ├── open_source_dataset.rst │ └── visualization.rst │ └── user_guides │ ├── ceph.md │ ├── chat.md │ ├── config.md │ ├── custom_dataset │ ├── Offline.md │ └── Online.md │ ├── 
dataset_format.md │ ├── dataset_prepare.md │ ├── finetune.md │ ├── ftdp_dataset │ ├── Case1.md │ ├── Case2.md │ ├── Case3.md │ ├── Case4.md │ └── README.md │ ├── incremental_pretraining.md │ ├── intern_repo_dataset.md │ ├── llava_offline.md │ ├── multi_turn_conversation.md │ ├── prompt_template.md │ ├── sequence_parallel.md │ ├── single_turn_conversation.md │ └── varlen_attention.md ├── examples ├── demo_data │ ├── multi_turn_1 │ │ ├── README.md │ │ ├── config.py │ │ ├── data.json │ │ └── map_fn.py │ ├── multi_turn_2 │ │ ├── README.md │ │ ├── config.py │ │ ├── data.json │ │ └── map_fn.py │ ├── pretrain │ │ ├── README.md │ │ ├── config.py │ │ ├── data.json │ │ └── map_fn.py │ └── single_turn │ │ ├── README.md │ │ ├── config.py │ │ ├── data.json │ │ └── map_fn.py └── huggingface_trainer │ ├── README.md │ ├── train_hf.py │ ├── train_lora_hf.py │ └── train_qlora_hf.py ├── requirements.txt ├── requirements ├── deepspeed.txt ├── docs.txt ├── lmdeploy.txt ├── modelscope.txt └── runtime.txt ├── setup.cfg ├── setup.py └── xtuner ├── __init__.py ├── _lite ├── __init__.py ├── accelerate │ ├── __init__.py │ ├── lora.py │ ├── ops │ │ ├── __init__.py │ │ └── moe_permute.py │ ├── packed.py │ └── utils.py ├── algorithms │ ├── __init__.py │ ├── ppo │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── loss.py │ │ └── model.py │ └── sft │ │ ├── __init__.py │ │ └── dataset.py ├── chat │ ├── __init__.py │ ├── backends │ │ └── __init__.py │ ├── messages │ │ ├── __init__.py │ │ ├── base.py │ │ └── chat.py │ └── templates │ │ ├── __init__.py │ │ ├── chat.py │ │ └── hybrid.py ├── datasets │ ├── __init__.py │ ├── json.py │ ├── jsonl.py │ ├── pack.py │ ├── streaming.py │ └── utils │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── load.py │ │ └── utils.py ├── device.py ├── modelings │ ├── __init__.py │ ├── internlm2 │ │ ├── __init__.py │ │ ├── configuration_internlm2.py │ │ └── modeling_internlm2.py │ ├── internlm3 │ │ ├── __init__.py │ │ ├── configuration_internlm3.py │ │ ├── modeling_internlm3.py │ │ └── tokenization_internlm3.py │ ├── internvl2 │ │ ├── __init__.py │ │ ├── configuration_intern_vit.py │ │ └── modeling_intern_vit.py │ └── llava │ │ ├── __init__.py │ │ ├── configuration_internlm2.py │ │ ├── configuration_llava.py │ │ ├── modeling_internlm2.py │ │ ├── modeling_llava.py │ │ └── processing_llava.py ├── parallel │ ├── __init__.py │ ├── comm.py │ ├── sampler.py │ ├── sequence │ │ ├── __init__.py │ │ ├── attention.py │ │ └── ops.py │ └── setup.py └── patches │ ├── __init__.py │ ├── auto.py │ ├── base.py │ ├── internlm3.py │ ├── llama.py │ ├── mixins │ ├── __init__.py │ └── generate.py │ ├── qwen2.py │ └── utils.py ├── apis ├── __init__.py ├── datasets │ ├── __init__.py │ ├── alpaca.py │ ├── arxiv.py │ ├── code_alpaca.py │ ├── colorist.py │ ├── lawyer.py │ ├── medical.py │ ├── moss_003_sft.py │ ├── oasst1.py │ ├── open_orca.py │ ├── sql.py │ ├── tiny_codes.py │ └── wizardlm.py ├── model.py └── training_args.py ├── configs ├── __init__.py ├── baichuan │ ├── baichuan2_13b_base │ │ ├── baichuan2_13b_base_qlora_alpaca_e3.py │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_e3.py │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan2_13b_base_qlora_alpaca_zh_e3.py │ │ ├── baichuan2_13b_base_qlora_arxiv_gentitle_e3.py │ │ ├── baichuan2_13b_base_qlora_code_alpaca_e3.py │ │ ├── baichuan2_13b_base_qlora_colorist_e5.py │ │ ├── baichuan2_13b_base_qlora_lawyer_e3.py │ │ ├── baichuan2_13b_base_qlora_oasst1_512_e3.py │ │ ├── baichuan2_13b_base_qlora_oasst1_e3.py │ │ ├── baichuan2_13b_base_qlora_open_platypus_e3.py 
│ │ └── baichuan2_13b_base_qlora_sql_e3.py │ ├── baichuan2_13b_chat │ │ ├── baichuan2_13b_chat_qlora_alpaca_e3.py │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_e3.py │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan2_13b_chat_qlora_alpaca_zh_e3.py │ │ ├── baichuan2_13b_chat_qlora_code_alpaca_e3.py │ │ ├── baichuan2_13b_chat_qlora_lawyer_e3.py │ │ ├── baichuan2_13b_chat_qlora_oasst1_512_e3.py │ │ ├── baichuan2_13b_chat_qlora_oasst1_e3.py │ │ └── baichuan2_13b_chat_qlora_open_platypus_e3.py │ ├── baichuan2_7b_base │ │ ├── baichuan2_7b_base_qlora_alpaca_e3.py │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_e3.py │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan2_7b_base_qlora_alpaca_zh_e3.py │ │ ├── baichuan2_7b_base_qlora_arxiv_gentitle_e3.py │ │ ├── baichuan2_7b_base_qlora_code_alpaca_e3.py │ │ ├── baichuan2_7b_base_qlora_colorist_e5.py │ │ ├── baichuan2_7b_base_qlora_lawyer_e3.py │ │ ├── baichuan2_7b_base_qlora_oasst1_512_e3.py │ │ ├── baichuan2_7b_base_qlora_oasst1_e3.py │ │ ├── baichuan2_7b_base_qlora_open_platypus_e3.py │ │ └── baichuan2_7b_base_qlora_sql_e3.py │ ├── baichuan2_7b_chat │ │ ├── baichuan2_7b_chat_qlora_alpaca_e3.py │ │ ├── baichuan2_7b_chat_qlora_alpaca_enzh_e3.py │ │ ├── baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan2_7b_chat_qlora_alpaca_zh_e3.py │ │ ├── baichuan2_7b_chat_qlora_code_alpaca_e3.py │ │ ├── baichuan2_7b_chat_qlora_lawyer_e3.py │ │ ├── baichuan2_7b_chat_qlora_oasst1_512_e3.py │ │ ├── baichuan2_7b_chat_qlora_oasst1_e3.py │ │ └── baichuan2_7b_chat_qlora_open_platypus_e3.py │ ├── baichuan_13b_base │ │ ├── baichuan_13b_base_qlora_alpaca_e3.py │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_e3.py │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan_13b_base_qlora_alpaca_zh_e3.py │ │ ├── baichuan_13b_base_qlora_arxiv_gentitle_e3.py │ │ ├── baichuan_13b_base_qlora_code_alpaca_e3.py │ │ ├── baichuan_13b_base_qlora_colorist_e5.py │ │ ├── baichuan_13b_base_qlora_lawyer_e3.py │ │ ├── baichuan_13b_base_qlora_medical_e1.py │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e1.py │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e2_gpu8.py │ │ ├── baichuan_13b_base_qlora_moss_sft_plugins_e1.py │ │ ├── baichuan_13b_base_qlora_oasst1_512_e3.py │ │ ├── baichuan_13b_base_qlora_oasst1_e3.py │ │ ├── baichuan_13b_base_qlora_open_platypus_e3.py │ │ ├── baichuan_13b_base_qlora_openorca_e1.py │ │ ├── baichuan_13b_base_qlora_sql_e3.py │ │ └── baichuan_13b_base_qlora_tiny_codes_e1.py │ ├── baichuan_13b_chat │ │ ├── baichuan_13b_chat_qlora_alpaca_e3.py │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_e3.py │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan_13b_chat_qlora_alpaca_zh_e3.py │ │ ├── baichuan_13b_chat_qlora_arxiv_gentitle_e3.py │ │ ├── baichuan_13b_chat_qlora_code_alpaca_e3.py │ │ ├── baichuan_13b_chat_qlora_colorist_e5.py │ │ ├── baichuan_13b_chat_qlora_lawyer_e3.py │ │ ├── baichuan_13b_chat_qlora_medical_e1.py │ │ ├── baichuan_13b_chat_qlora_oasst1_512_e3.py │ │ ├── baichuan_13b_chat_qlora_oasst1_e3.py │ │ ├── baichuan_13b_chat_qlora_open_platypus_e3.py │ │ ├── baichuan_13b_chat_qlora_openorca_e1.py │ │ ├── baichuan_13b_chat_qlora_sql_e3.py │ │ └── baichuan_13b_chat_qlora_tiny_codes_e1.py │ └── baichuan_7b │ │ ├── baichuan_7b_qlora_alpaca_e3.py │ │ ├── baichuan_7b_qlora_alpaca_enzh_e3.py │ │ ├── baichuan_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── baichuan_7b_qlora_alpaca_zh_e3.py │ │ ├── baichuan_7b_qlora_arxiv_gentitle_e3.py │ │ ├── baichuan_7b_qlora_code_alpaca_e3.py │ │ ├── 
baichuan_7b_qlora_colorist_e5.py │ │ ├── baichuan_7b_qlora_lawyer_e3.py │ │ ├── baichuan_7b_qlora_medical_e1.py │ │ ├── baichuan_7b_qlora_moss_sft_all_e1.py │ │ ├── baichuan_7b_qlora_moss_sft_all_e2_gpu8.py │ │ ├── baichuan_7b_qlora_moss_sft_plugins_e1.py │ │ ├── baichuan_7b_qlora_oasst1_512_e3.py │ │ ├── baichuan_7b_qlora_oasst1_e3.py │ │ ├── baichuan_7b_qlora_open_platypus_e3.py │ │ ├── baichuan_7b_qlora_openorca_e1.py │ │ ├── baichuan_7b_qlora_sql_e3.py │ │ └── baichuan_7b_qlora_tiny_codes_e1.py ├── chatglm │ ├── chatglm2_6b │ │ ├── chatglm2_6b_qlora_alpaca_e3.py │ │ ├── chatglm2_6b_qlora_alpaca_enzh_e3.py │ │ ├── chatglm2_6b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── chatglm2_6b_qlora_alpaca_zh_e3.py │ │ ├── chatglm2_6b_qlora_arxiv_gentitle_e3.py │ │ ├── chatglm2_6b_qlora_code_alpaca_e3.py │ │ ├── chatglm2_6b_qlora_colorist_e5.py │ │ ├── chatglm2_6b_qlora_lawyer_e3.py │ │ ├── chatglm2_6b_qlora_medical_e1.py │ │ ├── chatglm2_6b_qlora_oasst1_512_e3.py │ │ ├── chatglm2_6b_qlora_oasst1_e3.py │ │ ├── chatglm2_6b_qlora_open_platypus_e3.py │ │ ├── chatglm2_6b_qlora_openorca_e1.py │ │ ├── chatglm2_6b_qlora_sql_e3.py │ │ └── chatglm2_6b_qlora_tiny_codes_e1.py │ ├── chatglm3_6b │ │ ├── chatglm3_6b_qlora_alpaca_e3.py │ │ ├── chatglm3_6b_qlora_alpaca_enzh_e3.py │ │ ├── chatglm3_6b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── chatglm3_6b_qlora_alpaca_zh_e3.py │ │ ├── chatglm3_6b_qlora_arxiv_gentitle_e3.py │ │ ├── chatglm3_6b_qlora_code_alpaca_e3.py │ │ ├── chatglm3_6b_qlora_colorist_e5.py │ │ ├── chatglm3_6b_qlora_lawyer_e3.py │ │ ├── chatglm3_6b_qlora_medical_e1.py │ │ ├── chatglm3_6b_qlora_oasst1_512_e3.py │ │ ├── chatglm3_6b_qlora_oasst1_e3.py │ │ ├── chatglm3_6b_qlora_open_platypus_e3.py │ │ ├── chatglm3_6b_qlora_openorca_e1.py │ │ ├── chatglm3_6b_qlora_sql_e3.py │ │ └── chatglm3_6b_qlora_tiny_codes_e1.py │ └── chatglm3_6b_base │ │ ├── chatglm3_6b_base_qlora_alpaca_e3.py │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_e3.py │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── chatglm3_6b_base_qlora_alpaca_zh_e3.py │ │ ├── chatglm3_6b_base_qlora_arxiv_gentitle_e3.py │ │ ├── chatglm3_6b_base_qlora_code_alpaca_e3.py │ │ ├── chatglm3_6b_base_qlora_colorist_e5.py │ │ ├── chatglm3_6b_base_qlora_lawyer_e3.py │ │ ├── chatglm3_6b_base_qlora_medical_e1.py │ │ ├── chatglm3_6b_base_qlora_oasst1_512_e3.py │ │ ├── chatglm3_6b_base_qlora_oasst1_e3.py │ │ ├── chatglm3_6b_base_qlora_open_platypus_e3.py │ │ ├── chatglm3_6b_base_qlora_openorca_e1.py │ │ ├── chatglm3_6b_base_qlora_sql_e3.py │ │ └── chatglm3_6b_base_qlora_tiny_codes_e1.py ├── cohere │ ├── README.md │ └── cohere_104b │ │ └── cohere_100b_128k_sp32.py ├── custom_dataset │ ├── pretrain │ │ ├── baichuan │ │ │ ├── baichuan2_13b_base_full_custom_pretrain_e1.py │ │ │ └── baichuan2_7b_base_full_custom_pretrain_e1.py │ │ ├── chatglm │ │ │ ├── chatglm2_6b_full_custom_pretrain_e1.py │ │ │ └── chatglm3_6b_full_custom_pretrain_e1.py │ │ ├── deepseek │ │ │ └── deepseek_moe_16b_base_full_custom_pretrain_e1.py │ │ ├── gemma │ │ │ ├── gemma_2b_full_custom_pretrain_e1.py │ │ │ └── gemma_7b_full_custom_pretrain_e1.py │ │ ├── internlm │ │ │ ├── internlm2_1_8b_full_custom_pretrain_e1.py │ │ │ ├── internlm2_20b_full_custom_pretrain_e1.py │ │ │ └── internlm2_7b_full_custom_pretrain_e1.py │ │ ├── llama │ │ │ ├── llama2_70b_full_custom_pretrain_e1.py │ │ │ └── llama2_7b_full_custom_pretrain_e1.py │ │ ├── minicpm │ │ │ ├── minicpm3_4b_full_custom_pretrain_e1.py │ │ │ ├── minicpm_1b_full_custom_pretrain_e1.py │ │ │ └── minicpm_2b_full_custom_pretrain_e1.py │ │ ├── mistral │ │ │ 
└── mistral_7b_full_custom_pretrain_e1.py │ │ ├── mixtral │ │ │ └── mixtral_8x7b_full_custom_pretrain_e1.py │ │ ├── qwen │ │ │ ├── qwen1_5_0_5b_full_custom_pretrain_e1.py │ │ │ ├── qwen1_5_14b_full_custom_pretrain_e1.py │ │ │ ├── qwen1_5_1_8b_full_custom_pretrain_e1.py │ │ │ ├── qwen1_5_4b_full_custom_pretrain_e1.py │ │ │ ├── qwen1_5_72b_full_custom_pretrain_e1.py │ │ │ ├── qwen1_5_7b_full_custom_pretrain_e1.py │ │ │ ├── qwen_1_8b_full_custom_pretrain_e1.py │ │ │ ├── qwen_72b_full_custom_pretrain_e1.py │ │ │ └── qwen_7b_full_custom_pretrain_e1.py │ │ ├── starcoder │ │ │ └── starcoder_full_custom_pretrain_e1.py │ │ ├── yi │ │ │ ├── yi_34b_full_custom_pretrain_e1.py │ │ │ └── yi_6b_full_custom_pretrain_e1.py │ │ └── zephyr │ │ │ └── zephyr_7b_beta_full_custom_pretrain_e1.py │ └── sft │ │ ├── baichuan │ │ ├── baichuan2_13b_chat_qlora_custom_sft_e1.py │ │ ├── baichuan2_7b_chat_qlora_custom_sft_e1.py │ │ ├── baichuan_13b_chat_qlora_custom_sft_e1.py │ │ └── baichuan_7b_qlora_custom_sft_e1.py │ │ ├── chatglm │ │ ├── chatglm2_6b_qlora_custom_sft_e1.py │ │ └── chatglm3_6b_qlora_custom_sft_e1.py │ │ ├── deepseek │ │ ├── deepseek_moe_16b_chat_qlora_custom_sft_e1.py │ │ └── deepseekcoder_6_7b_instruct_qlora_custom_sft_e1.py │ │ ├── gemma │ │ ├── gemma_2b_it_qlora_custom_sft_e1.py │ │ ├── gemma_2b_qlora_custom_sft_e1.py │ │ ├── gemma_7b_it_qlora_custom_sft_e1.py │ │ └── gemma_7b_qlora_custom_sft_e1.py │ │ ├── internlm │ │ ├── internlm2_chat_1_8b_qlora_custom_sft_e1.py │ │ ├── internlm2_chat_20b_qlora_custom_sft_e1.py │ │ └── internlm2_chat_7b_qlora_custom_sft_e1.py │ │ ├── llama │ │ ├── llama2_70b_qlora_custom_sft_e1.py │ │ └── llama2_7b_chat_qlora_custom_sft_e1.py │ │ ├── minicpm │ │ ├── minicpm3_4b_chat_qlora_custom_sft_e1.py │ │ ├── minicpm_1b_full_custom_pretrain_e1.py │ │ └── minicpm_2b_full_custom_pretrain_e1.py │ │ ├── mistral │ │ └── mistral_7b_full_finetune_custom_sft_e1.py │ │ ├── mixtral │ │ └── mixtral_8x7b_instruct_qlora_custom_sft_e1.py │ │ ├── qwen │ │ ├── qwen1_5_0_5b_chat_qlora_custom_sft_e1.py │ │ ├── qwen1_5_14b_chat_qlora_custom_sft_e1.py │ │ ├── qwen1_5_1_8b_chat_qlora_custom_sft_e1.py │ │ ├── qwen1_5_4b_chat_qlora_custom_sft_e1.py │ │ ├── qwen1_5_72b_chat_qlora_custom_sft_e1.py │ │ ├── qwen1_5_7b_chat_qlora_custom_sft_e1.py │ │ ├── qwen_1_8b_chat_qlora_custom_sft_e1.py │ │ ├── qwen_72b_qlora_custom_sft_e1.py │ │ └── qwen_7b_chat_qlora_custom_sft_e1.py │ │ ├── starcoder │ │ └── starcoder_qlora_custom_sft_e1.py │ │ ├── yi │ │ ├── yi_34b_qlora_custom_sft_e1.py │ │ └── yi_6b_qlora_custom_sft_e1.py │ │ └── zephyr │ │ └── zephyr_7b_beta_qlora_custom_sft_e1.py ├── deepseek │ ├── README.md │ ├── deepseek_coder_6_7b_base │ │ └── deepseek_coder_6_7b_base_qlora_code_alpaca_e3.py │ ├── deepseek_coder_6_7b_instruct │ │ └── deepseekcoder_6_7b_instruct_qlora_code_alpaca_e3.py │ ├── deepseek_moe_16b_base │ │ ├── deepseek_moe_16b_base_full_oasst1_e3.py │ │ └── deepseek_moe_16b_base_qlora_oasst1_e3.py │ ├── deepseek_moe_16b_chat │ │ ├── deepseek_moe_16b_chat_full_oasst1_e3.py │ │ └── deepseek_moe_16b_chat_qlora_oasst1_e3.py │ ├── deepseek_v2_chat │ │ └── deepseek_v2_chat_full_alpaca_e3.py │ └── deepseek_v2_lite_chat │ │ ├── deepseek_v2_lite_chat_full_alpaca_e3.py │ │ └── deepseek_v2_lite_chat_full_alpaca_e3_32k_varlen.py ├── deepspeed │ ├── deepspeed_zero1.json │ ├── deepspeed_zero2.json │ ├── deepspeed_zero2_offload.json │ ├── deepspeed_zero3.json │ └── deepspeed_zero3_offload.json ├── dpo │ ├── internlm │ │ ├── internlm2_chat_1_8b_dpo_full.py │ │ ├── internlm2_chat_1_8b_dpo_full_varlenattn.py │ │ 
├── internlm2_chat_1_8b_dpo_full_varlenattn_jsonl_dataset.py │ │ └── internlm2_chat_7b_dpo_qlora_varlenattn.py │ └── llama │ │ └── llama3_8b_instruct_dpo_qlora_varlenattn.py ├── gemma │ ├── gemma_2b │ │ ├── gemma_2b_full_alpaca_e3.py │ │ └── gemma_2b_qlora_alpaca_e3.py │ ├── gemma_2b_it │ │ ├── gemma_2b_it_full_alpaca_e3.py │ │ └── gemma_2b_it_qlora_alpaca_e3.py │ ├── gemma_7b │ │ ├── gemma_7b_full_alpaca_e3.py │ │ └── gemma_7b_qlora_alpaca_e3.py │ └── gemma_7b_it │ │ ├── gemma_7b_it_full_alpaca_e3.py │ │ └── gemma_7b_it_qlora_alpaca_e3.py ├── internlm │ ├── internlm2_1_8b │ │ ├── internlm2_1_8b_full_alpaca_e3.py │ │ └── internlm2_1_8b_qlora_alpaca_e3.py │ ├── internlm2_20b │ │ ├── internlm2_20b_full_finetune_custom_dataset_e1.py │ │ ├── internlm2_20b_qlora_alpaca_e3.py │ │ ├── internlm2_20b_qlora_arxiv_gentitle_e3.py │ │ ├── internlm2_20b_qlora_code_alpaca_e3.py │ │ ├── internlm2_20b_qlora_colorist_e5.py │ │ ├── internlm2_20b_qlora_lawyer_e3.py │ │ ├── internlm2_20b_qlora_msagent_react_e3_gpu8.py │ │ ├── internlm2_20b_qlora_oasst1_512_e3.py │ │ ├── internlm2_20b_qlora_oasst1_e3.py │ │ └── internlm2_20b_qlora_sql_e3.py │ ├── internlm2_5_chat_20b │ │ ├── internlm2_5_chat_20b_alpaca_e3.py │ │ └── internlm2_5_chat_20b_qlora_alpaca_e3.py │ ├── internlm2_5_chat_7b │ │ ├── internlm2_5_chat_7b_full_finetune_custom_dataset_e1.py │ │ ├── internlm2_5_chat_7b_qlora_alpaca_e3.py │ │ └── internlm2_5_chat_7b_qlora_oasst1_e3.py │ ├── internlm2_7b │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1.py │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1_sequence_parallel_4.py │ │ ├── internlm2_7b_qlora_alpaca_e3.py │ │ ├── internlm2_7b_qlora_arxiv_gentitle_e3.py │ │ ├── internlm2_7b_qlora_code_alpaca_e3.py │ │ ├── internlm2_7b_qlora_colorist_e5.py │ │ ├── internlm2_7b_qlora_json_e3.py │ │ ├── internlm2_7b_qlora_lawyer_e3.py │ │ ├── internlm2_7b_qlora_msagent_react_e3_gpu8.py │ │ ├── internlm2_7b_qlora_oasst1_512_e3.py │ │ ├── internlm2_7b_qlora_oasst1_e3.py │ │ ├── internlm2_7b_qlora_sql_e3.py │ │ ├── internlm2_7b_w_internevo_dataset.py │ │ ├── internlm2_7b_w_tokenized_dataset.py │ │ └── internlm2_7b_w_untokenized_dataset.py │ ├── internlm2_chat_1_8b │ │ ├── internlm2_chat_1_8b_full_alpaca_e3.py │ │ └── internlm2_chat_1_8b_qlora_alpaca_e3.py │ ├── internlm2_chat_20b │ │ ├── internlm2_chat_20b_full_finetune_custom_dataset_e1.py │ │ ├── internlm2_chat_20b_qlora_alpaca_e3.py │ │ ├── internlm2_chat_20b_qlora_code_alpaca_e3.py │ │ ├── internlm2_chat_20b_qlora_lawyer_e3.py │ │ ├── internlm2_chat_20b_qlora_oasst1_512_e3.py │ │ └── internlm2_chat_20b_qlora_oasst1_e3.py │ ├── internlm2_chat_7b │ │ ├── internlm2_chat_7b_full_finetune_custom_dataset_e1.py │ │ ├── internlm2_chat_7b_qlora_alpaca_e3.py │ │ ├── internlm2_chat_7b_qlora_code_alpaca_e3.py │ │ ├── internlm2_chat_7b_qlora_lawyer_e3.py │ │ ├── internlm2_chat_7b_qlora_oasst1_512_e3.py │ │ └── internlm2_chat_7b_qlora_oasst1_e3.py │ ├── internlm_20b │ │ ├── internlm_20b_qlora_alpaca_e3.py │ │ ├── internlm_20b_qlora_alpaca_enzh_e3.py │ │ ├── internlm_20b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── internlm_20b_qlora_alpaca_zh_e3.py │ │ ├── internlm_20b_qlora_arxiv_gentitle_e3.py │ │ ├── internlm_20b_qlora_code_alpaca_e3.py │ │ ├── internlm_20b_qlora_colorist_e5.py │ │ ├── internlm_20b_qlora_lawyer_e3.py │ │ ├── internlm_20b_qlora_msagent_react_e3_gpu8.py │ │ ├── internlm_20b_qlora_oasst1_512_e3.py │ │ ├── internlm_20b_qlora_oasst1_e3.py │ │ ├── internlm_20b_qlora_open_platypus_e3.py │ │ └── internlm_20b_qlora_sql_e3.py │ ├── internlm_7b │ │ ├── 
internlm_7b_full_alpaca_e3.py │ │ ├── internlm_7b_full_alpaca_enzh_e3.py │ │ ├── internlm_7b_full_alpaca_enzh_oasst1_e3.py │ │ ├── internlm_7b_full_alpaca_zh_e3.py │ │ ├── internlm_7b_full_intern_repo_dataset_template.py │ │ ├── internlm_7b_full_oasst1_e3.py │ │ ├── internlm_7b_qlora_alpaca_e3.py │ │ ├── internlm_7b_qlora_alpaca_enzh_e3.py │ │ ├── internlm_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── internlm_7b_qlora_alpaca_zh_e3.py │ │ ├── internlm_7b_qlora_arxiv_gentitle_e3.py │ │ ├── internlm_7b_qlora_code_alpaca_e3.py │ │ ├── internlm_7b_qlora_colorist_e5.py │ │ ├── internlm_7b_qlora_json_e3.py │ │ ├── internlm_7b_qlora_lawyer_e3.py │ │ ├── internlm_7b_qlora_medical_e1.py │ │ ├── internlm_7b_qlora_moss_sft_all_e1.py │ │ ├── internlm_7b_qlora_moss_sft_all_e2_gpu8.py │ │ ├── internlm_7b_qlora_moss_sft_plugins_e1.py │ │ ├── internlm_7b_qlora_msagent_react_e3_gpu8.py │ │ ├── internlm_7b_qlora_oasst1_512_e3.py │ │ ├── internlm_7b_qlora_oasst1_e3.py │ │ ├── internlm_7b_qlora_oasst1_e3_hf.py │ │ ├── internlm_7b_qlora_oasst1_mmlu_e3.py │ │ ├── internlm_7b_qlora_open_platypus_e3.py │ │ ├── internlm_7b_qlora_openorca_e1.py │ │ ├── internlm_7b_qlora_sql_e3.py │ │ └── internlm_7b_qlora_tiny_codes_e1.py │ ├── internlm_chat_20b │ │ ├── internlm_chat_20b_qlora_alpaca_e3.py │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_e3.py │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── internlm_chat_20b_qlora_alpaca_zh_e3.py │ │ ├── internlm_chat_20b_qlora_code_alpaca_e3.py │ │ ├── internlm_chat_20b_qlora_lawyer_e3.py │ │ ├── internlm_chat_20b_qlora_oasst1_512_e3.py │ │ ├── internlm_chat_20b_qlora_oasst1_e3.py │ │ └── internlm_chat_20b_qlora_open_platypus_e3.py │ └── internlm_chat_7b │ │ ├── internlm_chat_7b_qlora_alpaca_e3.py │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_e3.py │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── internlm_chat_7b_qlora_alpaca_zh_e3.py │ │ ├── internlm_chat_7b_qlora_arxiv_gentitle_e3.py │ │ ├── internlm_chat_7b_qlora_code_alpaca_e3.py │ │ ├── internlm_chat_7b_qlora_colorist_e5.py │ │ ├── internlm_chat_7b_qlora_lawyer_e3.py │ │ ├── internlm_chat_7b_qlora_medical_e1.py │ │ ├── internlm_chat_7b_qlora_oasst1_512_e3.py │ │ ├── internlm_chat_7b_qlora_oasst1_e3.py │ │ ├── internlm_chat_7b_qlora_open_platypus_e3.py │ │ ├── internlm_chat_7b_qlora_openorca_e1.py │ │ ├── internlm_chat_7b_qlora_sql_e3.py │ │ └── internlm_chat_7b_qlora_tiny_codes_e1.py ├── internvl │ ├── README.md │ ├── README_zh-CN.md │ ├── v1_5 │ │ ├── convert_to_official.py │ │ ├── internvl_v1_5_internlm2_26b_finetune.py │ │ ├── internvl_v1_5_internlm2_26b_lora_finetune.py │ │ ├── internvl_v1_5_internlm2_26b_qlora_finetune.py │ │ ├── internvl_v1_5_internlm2_2b_finetune.py │ │ ├── internvl_v1_5_internlm2_2b_lora_finetune.py │ │ ├── internvl_v1_5_internlm2_2b_qlora_finetune.py │ │ ├── internvl_v1_5_phi3_4b_finetune.py │ │ ├── internvl_v1_5_phi3_4b_lora_finetune.py │ │ └── internvl_v1_5_phi3_4b_qlora_finetune.py │ └── v2 │ │ ├── internvl_v2_internlm2_26b_finetune.py │ │ ├── internvl_v2_internlm2_26b_lora_finetune.py │ │ ├── internvl_v2_internlm2_26b_qlora_finetune.py │ │ ├── internvl_v2_internlm2_2b_finetune.py │ │ ├── internvl_v2_internlm2_2b_lora_finetune.py │ │ ├── internvl_v2_internlm2_2b_qlora_finetune.py │ │ ├── internvl_v2_internlm2_5_8b_finetune.py │ │ ├── internvl_v2_internlm2_5_8b_lora_finetune.py │ │ ├── internvl_v2_internlm2_5_8b_qlora_finetune.py │ │ ├── internvl_v2_phi3_4b_finetune.py │ │ ├── internvl_v2_phi3_4b_lora_finetune.py │ │ └── internvl_v2_phi3_4b_qlora_finetune.py ├── llama │ ├── 
llama2_70b │ │ ├── llama2_70b_full_wizardlm_e1.py │ │ ├── llama2_70b_int8_lora_open_platypus_e1.py │ │ ├── llama2_70b_int8_lora_open_platypus_e1_hf.py │ │ ├── llama2_70b_qlora_open_platypus_e1.py │ │ └── llama2_70b_qlora_open_platypus_e1_hf.py │ ├── llama2_7b │ │ ├── llama2_7b_full_pgbooks_400iters_sp1.py │ │ ├── llama2_7b_full_pgbooks_400iters_sp4.py │ │ ├── llama2_7b_full_wizardlm_e1.py │ │ ├── llama2_7b_qlora_alpaca_e3.py │ │ ├── llama2_7b_qlora_alpaca_enzh_e3.py │ │ ├── llama2_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── llama2_7b_qlora_alpaca_zh_e3.py │ │ ├── llama2_7b_qlora_arxiv_gentitle_e3.py │ │ ├── llama2_7b_qlora_code_alpaca_e3.py │ │ ├── llama2_7b_qlora_colorist_e5.py │ │ ├── llama2_7b_qlora_lawyer_e3.py │ │ ├── llama2_7b_qlora_medical_e1.py │ │ ├── llama2_7b_qlora_moss_sft_all_e1.py │ │ ├── llama2_7b_qlora_moss_sft_all_e2_gpu8.py │ │ ├── llama2_7b_qlora_moss_sft_plugins_e1.py │ │ ├── llama2_7b_qlora_msagent_react_e3_gpu8.py │ │ ├── llama2_7b_qlora_oasst1_512_e3.py │ │ ├── llama2_7b_qlora_oasst1_e3.py │ │ ├── llama2_7b_qlora_open_platypus_e3.py │ │ ├── llama2_7b_qlora_openorca_e1.py │ │ ├── llama2_7b_qlora_sql_e3.py │ │ └── llama2_7b_qlora_tiny_codes_e1.py │ ├── llama2_7b_chat │ │ ├── llama2_7b_chat_qlora_alpaca_e3.py │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_e3.py │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── llama2_7b_chat_qlora_alpaca_zh_e3.py │ │ ├── llama2_7b_chat_qlora_arxiv_gentitle_e3.py │ │ ├── llama2_7b_chat_qlora_code_alpaca_e3.py │ │ ├── llama2_7b_chat_qlora_colorist_e5.py │ │ ├── llama2_7b_chat_qlora_lawyer_e3.py │ │ ├── llama2_7b_chat_qlora_medical_e1.py │ │ ├── llama2_7b_chat_qlora_oasst1_512_e3.py │ │ ├── llama2_7b_chat_qlora_oasst1_e3.py │ │ ├── llama2_7b_chat_qlora_open_platypus_e3.py │ │ ├── llama2_7b_chat_qlora_openorca_e1.py │ │ ├── llama2_7b_chat_qlora_sql_e3.py │ │ └── llama2_7b_chat_qlora_tiny_codes_e1.py │ ├── llama3_70b_instruct │ │ └── llama3_70b_instruct_qlora_alpaca_e3_2k_gpu8.py │ ├── llama3_8b │ │ ├── README.md │ │ └── llama3_8b_full_alpaca_e3.py │ ├── llama3_8b_instruct │ │ ├── llama3_8b_instruct_full_alpaca_e3.py │ │ └── llama3_8b_instruct_qlora_alpaca_e3.py │ └── llama_7b │ │ ├── llama_7b_qlora_alpaca_e3.py │ │ ├── llama_7b_qlora_alpaca_enzh_e3.py │ │ ├── llama_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ ├── llama_7b_qlora_alpaca_zh_e3.py │ │ ├── llama_7b_qlora_arxiv_gentitle_e3.py │ │ ├── llama_7b_qlora_code_alpaca_e3.py │ │ ├── llama_7b_qlora_colorist_e5.py │ │ ├── llama_7b_qlora_lawyer_e3.py │ │ ├── llama_7b_qlora_medical_e1.py │ │ ├── llama_7b_qlora_moss_sft_all_e1.py │ │ ├── llama_7b_qlora_moss_sft_all_e2_gpu8.py │ │ ├── llama_7b_qlora_moss_sft_plugins_e1.py │ │ ├── llama_7b_qlora_oasst1_512_e3.py │ │ ├── llama_7b_qlora_oasst1_e3.py │ │ ├── llama_7b_qlora_open_platypus_e3.py │ │ ├── llama_7b_qlora_openorca_e1.py │ │ ├── llama_7b_qlora_sql_e3.py │ │ └── llama_7b_qlora_tiny_codes_e1.py ├── llama_speed_benchmark │ ├── llama2_70b │ │ ├── llama2_70b_full_alpaca_enzh_128k_sp8.py │ │ ├── llama2_70b_full_alpaca_enzh_256k_sp16.py │ │ ├── llama2_70b_full_alpaca_enzh_32k_sp4.py │ │ └── llama2_70b_full_alpaca_enzh_8k_sp1.py │ ├── llama2_7b │ │ ├── llama2_7b_full_alpaca_enzh_128k_sp8.py │ │ ├── llama2_7b_full_alpaca_enzh_1M_sp16.py │ │ ├── llama2_7b_full_alpaca_enzh_256k_sp8.py │ │ ├── llama2_7b_full_alpaca_enzh_32k_sp1.py │ │ └── llama2_7b_full_alpaca_enzh_8k_sp1.py │ └── yi_34b │ │ ├── yi_34b_200k_full_alpaca_enzh_128k_sp8.py │ │ ├── yi_34b_200k_full_alpaca_enzh_256k_sp8.py │ │ ├── yi_34b_200k_full_alpaca_enzh_32k_sp2.py │ │ └── 
yi_34b_200k_full_alpaca_enzh_8k_sp1.py ├── llava │ ├── README.md │ ├── README_zh-CN.md │ ├── internlm2_chat_1_8b_clip_vit_large_p14_336 │ │ ├── finetune │ │ │ └── llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── pretrain │ │ │ └── llava_internlm2_chat_1_8b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── internlm2_chat_20b_clip_vit_large_p14_336 │ │ ├── finetune │ │ │ ├── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ └── llava_internlm2_chat_20b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── pretrain │ │ │ └── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── internlm2_chat_7b_clip_vit_large_p14_336 │ │ ├── finetune │ │ │ ├── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ └── llava_internlm2_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── pretrain │ │ │ └── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── internlm_chat_7b_clip_vit_large_p14_336 │ │ ├── finetune │ │ │ └── llava_internlm_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── pretrain │ │ │ └── llava_internlm_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── llama3_70b_instruct_clip_vit_large_p14_336 │ │ └── pretrain │ │ │ └── llava_llama3_70b_instruct_quant_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── llama3_8b_instruct_clip_vit_large_p14_336 │ │ ├── README.md │ │ ├── convert_xtuner_weights_to_hf.py │ │ ├── convert_xtuner_weights_to_llava.py │ │ ├── finetune │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_internvl_finetune.py │ │ │ └── llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_e1_gpu1_finetune.py │ │ └── pretrain │ │ │ ├── llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ │ ├── llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py │ │ │ └── llava_llama3_8b_instruct_quant_clip_vit_large_p14_336_e1_gpu1_pretrain.py │ ├── official │ │ ├── llava_v15_13b │ │ │ ├── llava_v15_13b_finetune.py │ │ │ ├── llava_v15_13b_finetune_lora.py │ │ │ └── llava_v15_13b_pretrain.py │ │ └── llava_v15_7b │ │ │ ├── llava_v15_7b_finetune.py │ │ │ ├── llava_v15_7b_finetune_lora.py │ │ │ └── llava_v15_7b_pretrain.py │ ├── phi3_mini_4k_instruct_clip_vit_large_p14_336 │ │ ├── README.md │ │ ├── convert_phi_to_llama.py │ │ ├── convert_xtuner_weights_to_hf.py │ │ ├── convert_xtuner_weights_to_llava.py │ │ ├── finetune │ │ │ ├── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ └── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_full_e2_gpu8_internvl_finetune.py │ │ └── pretrain │ │ │ ├── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ │ └── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py │ ├── vicuna_13b_v15_clip_vit_large_p14_336 │ │ ├── finetune │ │ │ └── llava_vicuna_13b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── pretrain │ │ │ └── llava_vicuna_13b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ └── vicuna_7b_v15_clip_vit_large_p14_336 │ │ ├── finetune │ │ ├── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ └── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_refcoco.py │ │ └── pretrain │ │ └── 
llava_vicuna_7b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py ├── minicpm │ ├── 1_2b │ │ ├── minicpm_1b_dpo_qlora.py │ │ ├── minicpm_1b_full_alpaca_zh_e3.py │ │ ├── minicpm_1b_lora_alpaca_zh_e3.py │ │ ├── minicpm_1b_qlora_alpaca_enzh_e3.py │ │ └── minicpm_1b_qlora_alpaca_zh_e3.py │ ├── 2b │ │ ├── minicpm_2b_dpo_qlora.py │ │ ├── minicpm_2b_full_alpaca_zh_e3.py │ │ ├── minicpm_2b_lora_alpaca_zh_e3.py │ │ ├── minicpm_2b_qlora_alpaca_enzh_e3.py │ │ └── minicpm_2b_qlora_alpaca_zh_e3.py │ └── minicpm3_4b │ │ ├── minicpm3_4b_dpo_qlora.py │ │ └── minicpm3_4b_full_alpaca_zh_e3.py ├── mistral │ ├── mistral_7b_full_finetune_custom_dataset_e1.py │ ├── mistral_7b_qlora_skypile_pretrain_e1.py │ ├── mistral_7b_w_tokenized_dataset.py │ └── mistral_7b_w_untokenized_dataset.py ├── mixtral │ ├── README.md │ ├── mixtral_8x7b │ │ ├── mixtral_8x7b_full_oasst1_e3.py │ │ └── mixtral_8x7b_qlora_oasst1_e3.py │ └── mixtral_8x7b_instruct │ │ ├── mixtral_8x7b_instruct_full_oasst1_e3.py │ │ └── mixtral_8x7b_instruct_qlora_oasst1_e3.py ├── orpo │ ├── internlm │ │ ├── internlm2_chat_1_8b_orpo_full.py │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn.py │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn_jsonl_dataset.py │ │ └── internlm2_chat_7b_orpo_qlora_varlenattn_ultrafeedback_e5.py │ └── llama │ │ └── llama3_8b_instruct_orpo_qlora_varlenattn_ultrafeedback_e5.py ├── phi │ └── phi3 │ │ ├── phi3_mini_128k_instruct_full_alpaca_e3.py │ │ ├── phi3_mini_128k_instruct_qlora_alpaca_e3.py │ │ ├── phi3_mini_4k_instruct_full_alpaca_e3.py │ │ └── phi3_mini_4k_instruct_qlora_alpaca_e3.py ├── qwen │ ├── qwen1 │ │ ├── qwen_1_8b │ │ │ ├── qwen_1_8b_qlora_alpaca_e3.py │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_e3.py │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── qwen_1_8b_qlora_alpaca_zh_e3.py │ │ │ └── qwen_1_8b_qlora_code_alpaca_e3.py │ │ ├── qwen_1_8b_chat │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_e3.py │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_zh_e3.py │ │ │ └── qwen_1_8b_chat_qlora_code_alpaca_e3.py │ │ ├── qwen_72b │ │ │ ├── qwen_72b_qlora_alpaca_e3.py │ │ │ ├── qwen_72b_qlora_alpaca_enzh_e3.py │ │ │ ├── qwen_72b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── qwen_72b_qlora_alpaca_zh_e3.py │ │ │ └── qwen_72b_qlora_code_alpaca_e3.py │ │ ├── qwen_7b │ │ │ ├── qwen_7b_qlora_alpaca_e3.py │ │ │ ├── qwen_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── qwen_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── qwen_7b_qlora_alpaca_zh_e3.py │ │ │ ├── qwen_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── qwen_7b_qlora_code_alpaca_e3.py │ │ │ ├── qwen_7b_qlora_colorist_e5.py │ │ │ ├── qwen_7b_qlora_lawyer_e3.py │ │ │ ├── qwen_7b_qlora_medical_e1.py │ │ │ ├── qwen_7b_qlora_moss_sft_all_e1.py │ │ │ ├── qwen_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── qwen_7b_qlora_moss_sft_plugins_e1.py │ │ │ ├── qwen_7b_qlora_oasst1_512_e3.py │ │ │ ├── qwen_7b_qlora_oasst1_e3.py │ │ │ ├── qwen_7b_qlora_open_platypus_e3.py │ │ │ ├── qwen_7b_qlora_openorca_e1.py │ │ │ ├── qwen_7b_qlora_sql_e3.py │ │ │ └── qwen_7b_qlora_tiny_codes_e1.py │ │ └── qwen_7b_chat │ │ │ ├── qwen_7b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_e3.py │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── qwen_7b_chat_qlora_alpaca_zh_e3.py │ │ │ ├── qwen_7b_chat_qlora_arxiv_gentitle_e3.py │ │ │ ├── qwen_7b_chat_qlora_code_alpaca_e3.py │ │ │ ├── qwen_7b_chat_qlora_colorist_e5.py │ │ │ ├── qwen_7b_chat_qlora_lawyer_e3.py │ │ │ ├── qwen_7b_chat_qlora_medical_e1.py │ │ │ ├── 
qwen_7b_chat_qlora_oasst1_512_e3.py │ │ │ ├── qwen_7b_chat_qlora_oasst1_e3.py │ │ │ ├── qwen_7b_chat_qlora_open_platypus_e3.py │ │ │ ├── qwen_7b_chat_qlora_openorca_e1.py │ │ │ ├── qwen_7b_chat_qlora_sql_e3.py │ │ │ └── qwen_7b_chat_qlora_tiny_codes_e1.py │ └── qwen1_5 │ │ ├── qwen1_5_0_5b │ │ ├── qwen1_5_0_5b_full_alpaca_e3.py │ │ └── qwen1_5_0_5b_qlora_alpaca_e3.py │ │ ├── qwen1_5_0_5b_chat │ │ ├── qwen1_5_0_5b_chat_full_alpaca_e3.py │ │ └── qwen1_5_0_5b_chat_qlora_alpaca_e3.py │ │ ├── qwen1_5_110b │ │ ├── qwen1_5_110b_full_alpaca_e3.py │ │ └── qwen1_5_110b_qlora_alpaca_e3.py │ │ ├── qwen1_5_110b_chat │ │ ├── README.md │ │ ├── qwen1_5_110b_chat_full_alpaca_e3.py │ │ ├── qwen1_5_110b_chat_qlora_alpaca_e3.py │ │ └── qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py │ │ ├── qwen1_5_14b │ │ ├── qwen1_5_14b_full_alpaca_e3.py │ │ └── qwen1_5_14b_qlora_alpaca_e3.py │ │ ├── qwen1_5_14b_chat │ │ ├── qwen1_5_14b_chat_full_alpaca_e3.py │ │ └── qwen1_5_14b_chat_qlora_alpaca_e3.py │ │ ├── qwen1_5_1_8b │ │ ├── qwen1_5_1_8b_full_alpaca_e3.py │ │ └── qwen1_5_1_8b_qlora_alpaca_e3.py │ │ ├── qwen1_5_1_8b_chat │ │ ├── qwen1_5_1_8b_chat_full_alpaca_e3.py │ │ └── qwen1_5_1_8b_chat_qlora_alpaca_e3.py │ │ ├── qwen1_5_4b │ │ ├── qwen1_5_4b_full_alpaca_e3.py │ │ ├── qwen1_5_4b_qlora_alpaca_e3.py │ │ └── qwen1_5_4b_qlora_alpaca_e3_openmind.py │ │ ├── qwen1_5_4b_chat │ │ ├── qwen1_5_4b_chat_full_alpaca_e3.py │ │ └── qwen1_5_4b_chat_qlora_alpaca_e3.py │ │ ├── qwen1_5_72b │ │ ├── qwen1_5_72b_full_alpaca_e3.py │ │ └── qwen1_5_72b_qlora_alpaca_e3.py │ │ ├── qwen1_5_72b_chat │ │ ├── qwen1_5_72b_chat_full_alpaca_e3.py │ │ └── qwen1_5_72b_chat_qlora_alpaca_e3.py │ │ ├── qwen1_5_7b │ │ ├── qwen1_5_7b_full_alpaca_e3.py │ │ └── qwen1_5_7b_qlora_alpaca_e3.py │ │ └── qwen1_5_7b_chat │ │ ├── qwen1_5_7b_chat_full_alpaca_e3.py │ │ └── qwen1_5_7b_chat_qlora_alpaca_e3.py ├── qwen_moe │ └── qwen1_5 │ │ └── qwen1_5_moe_a2_7_b_chat │ │ └── qwen1_5_moe_a2_7_b_chat_full_alpaca_e3.py ├── reward_model │ ├── internlm │ │ ├── internlm2_chat_1_8b_reward_full_ultrafeedback.py │ │ ├── internlm2_chat_1_8b_reward_full_varlenattn_jsonl_dataset.py │ │ ├── internlm2_chat_1_8b_reward_full_varlenattn_ultrafeedback.py │ │ └── internlm2_chat_1_8b_reward_qlora_varlenattn_ultrafeedback.py │ └── llama │ │ └── llama3_8b_instruct_reward_full_varlenattn_ultrafeedback.py ├── starcoder │ └── starcoder_qlora_stack_exchange_example.py ├── yi │ ├── yi_34b │ │ └── yi_34b_qlora_alpaca_enzh_e3.py │ └── yi_6b │ │ └── yi_6b_qlora_alpaca_enzh_e3.py └── zephyr │ └── zephyr_7b_beta_qlora_alpaca_e3.py ├── dataset ├── __init__.py ├── collate_fns │ ├── __init__.py │ ├── default_collate_fn.py │ ├── mmlu_collate_fn.py │ └── preference_collate_fn.py ├── concat_dataset.py ├── huggingface.py ├── intern_repo.py ├── internvl_dataset.py ├── json_dataset.py ├── llava.py ├── map_fns │ ├── __init__.py │ ├── dataset_map_fns │ │ ├── __init__.py │ │ ├── alpaca_map_fn.py │ │ ├── alpaca_zh_map_fn.py │ │ ├── arxiv_map_fn.py │ │ ├── code_alpaca_map_fn.py │ │ ├── colors_map_fn.py │ │ ├── crime_kg_assitant_map_fn.py │ │ ├── default_map_fn.py │ │ ├── law_reference_map_fn.py │ │ ├── llava_map_fn.py │ │ ├── medical_map_fn.py │ │ ├── msagent_map_fn.py │ │ ├── oasst1_map_fn.py │ │ ├── openai_map_fn.py │ │ ├── openorca_map_fn.py │ │ ├── pretrain_map_fn.py │ │ ├── sql_map_fn.py │ │ ├── stack_exchange_map_fn.py │ │ ├── tiny_codes_map_fn.py │ │ └── wizardlm_map_fn.py │ └── template_map_fn.py ├── modelscope.py ├── moss_sft.py ├── preference_dataset.py ├── refcoco_json.py ├── samplers │ ├── __init__.py │ 
├── intern_repo.py │ └── length_grouped.py └── utils.py ├── engine ├── __init__.py ├── _strategy │ ├── __init__.py │ └── deepspeed.py ├── hooks │ ├── __init__.py │ ├── dataset_info_hook.py │ ├── evaluate_chat_hook.py │ ├── hf_checkpoint_hook.py │ ├── throughput_hook.py │ └── varlen_attn_args_to_messagehub_hook.py └── runner │ ├── __init__.py │ └── loops.py ├── entry_point.py ├── evaluation ├── __init__.py └── metrics │ ├── __init__.py │ ├── mmlu_metric.py │ └── reward_metric.py ├── model ├── __init__.py ├── dpo.py ├── internvl.py ├── llava.py ├── modules │ ├── __init__.py │ ├── dispatch │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── baichuan.py │ │ ├── cohere.py │ │ ├── deepseek_v2.py │ │ ├── internlm.py │ │ ├── internlm2.py │ │ ├── internlm3.py │ │ ├── llama.py │ │ ├── mistral.py │ │ ├── phi3.py │ │ ├── qwen2.py │ │ ├── triton_kernels │ │ │ ├── __init__.py │ │ │ ├── layer_norm.py │ │ │ ├── rms_norm.py │ │ │ └── rotary.py │ │ ├── utils.py │ │ └── yi.py │ └── projector │ │ ├── __init__.py │ │ ├── configuration_projector.py │ │ └── modeling_projector.py ├── orpo.py ├── reward.py ├── sft.py ├── transformers_models │ ├── __init__.py │ ├── deepseek_v2 │ │ ├── __init__.py │ │ ├── configuration_deepseek.py │ │ ├── modeling_deepseek.py │ │ └── tokenization_deepseek_fast.py │ └── mixtral │ │ ├── __init__.py │ │ ├── configuration_mixtral.py │ │ └── modeling_mixtral.py └── utils.py ├── parallel ├── __init__.py └── sequence │ ├── __init__.py │ ├── attention.py │ ├── comm.py │ ├── data_collate.py │ ├── reduce_loss.py │ ├── sampler.py │ └── setup_distributed.py ├── registry.py ├── tools ├── chat.py ├── check_custom_dataset.py ├── copy_cfg.py ├── data_preprocess │ ├── arxiv.py │ └── convert_refcoco.py ├── eval_refcoco.py ├── get_data_order.py ├── list_cfg.py ├── list_dataset_format.py ├── log_dataset.py ├── mmbench.py ├── model_converters │ ├── merge.py │ ├── modeling_internlm2_reward │ │ ├── __init__.py │ │ ├── configuration_internlm2.py │ │ └── modeling_internlm2.py │ ├── pth_to_hf.py │ └── split.py ├── plugins │ ├── __init__.py │ ├── api.py │ ├── calculate.py │ ├── search.py │ └── solve.py ├── process_untokenized_datasets.py ├── process_untokenized_datasets_legacy.py ├── process_untokenized_llava_data.py ├── test.py ├── tokenize_ftdp_datasets.py ├── train.py └── utils.py ├── utils ├── __init__.py ├── constants.py ├── device.py ├── fileio.py ├── handle_moe_load_and_save.py ├── stop_criteria.py ├── templates.py └── zero_to_any_dtype.py └── version.py /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | 3 | on: push 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | build-n-publish: 11 | runs-on: ubuntu-latest 12 | if: startsWith(github.event.ref, 'refs/tags') 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Build XTuner 20 | run: | 21 | pip install wheel 22 | python setup.py sdist bdist_wheel 23 | - name: Publish distribution to PyPI 24 | run: | 25 | pip install twine 26 | twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} 27 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref 
}} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.8 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/*/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # custom 107 | data/ 108 | data 109 | .vscode 110 | .idea 111 | .DS_Store 112 | *.pkl 113 | *.pkl.json 114 | *.log.json 115 | work_dirs/ 116 | 117 | # Pytorch 118 | *.pth 119 | *.py~ 120 | *.sh~ 121 | 122 | # srun 123 | *.out 124 | batchscript-* 125 | -------------------------------------------------------------------------------- /.owners.yml: -------------------------------------------------------------------------------- 1 | assign: 2 | issues: disabled 3 | pull_requests: disabled 4 | strategy: 5 | random 6 | # daily-shift-based 7 | schedule: 8 | '*/1 * * * *' 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^tests/data/|^xtuner/model/transformers_models/|^xtuner/tools/model_converters/modeling_internlm2_reward/|^xtuner/_lite/modelings/|^xtuner/_lite/accelerate/dispatches/huggingface/ 2 | repos: 3 | - repo: https://github.com/PyCQA/flake8 4 | rev: 5.0.4 5 | hooks: 6 | - id: flake8 7 | args: ["--max-line-length=119"] 8 | - repo: https://github.com/PyCQA/isort 9 | rev: 5.12.0 10 | hooks: 11 | - id: isort 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v5.0.0 14 | hooks: 15 | - 
id: check-yaml 16 | - id: requirements-txt-fixer 17 | - id: check-merge-conflict 18 | - id: fix-encoding-pragma 19 | args: ["--remove"] 20 | - id: mixed-line-ending 21 | args: ["--fix=lf"] 22 | - repo: https://github.com/codespell-project/codespell 23 | rev: v2.2.1 24 | hooks: 25 | - id: codespell 26 | - repo: https://github.com/executablebooks/mdformat 27 | rev: 0.7.9 28 | hooks: 29 | - id: mdformat 30 | args: ["--number"] 31 | additional_dependencies: 32 | - mdformat-openmmlab 33 | - mdformat_frontmatter 34 | - linkify-it-py 35 | exclude: 'docs/zh_cn/user_guides/sequence_parallel.md' 36 | - repo: https://github.com/myint/docformatter 37 | rev: v1.3.1 38 | hooks: 39 | - id: docformatter 40 | args: ["--in-place", "--wrap-descriptions", "119"] 41 | - repo: https://github.com/open-mmlab/pre-commit-hooks 42 | rev: v0.4.0 43 | hooks: 44 | - id: check-copyright 45 | args: ["xtuner", "--excludes", "xtuner/_lite/modelings/", "xtuner/model/transformers_models/"] 46 | - id: remove-improper-eol-in-cn-docs 47 | - repo: https://github.com/asottile/pyupgrade 48 | rev: v3.0.0 49 | hooks: 50 | - id: pyupgrade 51 | args: ["--py36-plus"] 52 | 53 | - repo: https://github.com/psf/black 54 | rev: 23.9.1 55 | hooks: 56 | - id: black 57 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include xtuner/configs *.py *.yml *.json 2 | recursive-include xtuner/tools *.sh *.py 3 | -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.8" 7 | 8 | formats: 9 | - epub 10 | 11 | python: 12 | install: 13 | - requirements: requirements/docs.txt 14 | 15 | sphinx: 16 | configuration: docs/en/conf.py 17 | -------------------------------------------------------------------------------- /docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/en/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.png"); 3 | background-size: 177px 40px; 4 | height: 40px; 5 | width: 177px; 6 | } 7 | -------------------------------------------------------------------------------- /docs/en/_static/image/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/xtuner/53f2429d8a4662c04a8a4a2dc5c941672f4d3bdd/docs/en/_static/image/logo.png -------------------------------------------------------------------------------- /docs/en/acceleration/benchmark.rst: -------------------------------------------------------------------------------- 1 | Benchmark 2 | ========= 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/deepspeed.rst: -------------------------------------------------------------------------------- 1 | DeepSpeed 2 | ========= 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/flash_attn.rst: -------------------------------------------------------------------------------- 1 | Flash Attention 2 | =============== 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/hyper_parameters.rst: -------------------------------------------------------------------------------- 1 | HyperParameters 2 | =============== 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/length_grouped_sampler.rst: -------------------------------------------------------------------------------- 1 | Length Grouped Sampler 2 | ====================== 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/pack_to_max_length.rst: -------------------------------------------------------------------------------- 1 | Pack to Max Length 2 | ================== 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/train_extreme_long_sequence.rst: -------------------------------------------------------------------------------- 1 | Train Extreme Long Sequence 2 | =========================== 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/train_large_scale_dataset.rst: -------------------------------------------------------------------------------- 1 | Train Large-scale Dataset 2 | ========================= 3 | -------------------------------------------------------------------------------- /docs/en/acceleration/varlen_flash_attn.rst: -------------------------------------------------------------------------------- 1 | Varlen Flash Attention 2 | ====================== 3 | -------------------------------------------------------------------------------- /docs/en/chat/agent.md: -------------------------------------------------------------------------------- 1 | # Chat with Agent 2 | -------------------------------------------------------------------------------- /docs/en/chat/llm.md: -------------------------------------------------------------------------------- 1 | # Chat with LLM 2 | 
-------------------------------------------------------------------------------- /docs/en/chat/lmdeploy.md: -------------------------------------------------------------------------------- 1 | # Accelerate Chat with LMDeploy 2 | -------------------------------------------------------------------------------- /docs/en/chat/vlm.md: -------------------------------------------------------------------------------- 1 | # Chat with VLM 2 | -------------------------------------------------------------------------------- /docs/en/evaluation/hook.md: -------------------------------------------------------------------------------- 1 | # Evaluation during training 2 | -------------------------------------------------------------------------------- /docs/en/evaluation/mmbench.md: -------------------------------------------------------------------------------- 1 | # MMBench (VLM) 2 | -------------------------------------------------------------------------------- /docs/en/evaluation/mmlu.md: -------------------------------------------------------------------------------- 1 | # MMLU (LLM) 2 | -------------------------------------------------------------------------------- /docs/en/evaluation/opencompass.md: -------------------------------------------------------------------------------- 1 | # Evaluate with OpenCompass 2 | -------------------------------------------------------------------------------- /docs/en/get_started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | In this section, we will show you how to install XTuner. 4 | 5 | ## Installation Process 6 | 7 | We recommend following the best practices below to install XTuner. 8 | In particular, we suggest using a conda virtual environment with Python 3.10. 9 | 10 | ### Best Practices 11 | 12 | **Step 0.** Create a Python 3.10 virtual environment using conda. 13 | 14 | ```shell 15 | conda create --name xtuner-env python=3.10 -y 16 | conda activate xtuner-env 17 | ``` 18 | 19 | **Step 1.** Install XTuner. 20 | 21 | Case a: Install XTuner via pip: 22 | 23 | ```shell 24 | pip install -U xtuner 25 | ``` 26 | 27 | Case b: Install XTuner with DeepSpeed integration: 28 | 29 | ```shell 30 | pip install -U 'xtuner[deepspeed]' 31 | ``` 32 | 33 | Case c: Install XTuner from the source code: 34 | 35 | ```shell 36 | git clone https://github.com/InternLM/xtuner.git 37 | cd xtuner 38 | pip install -e '.[all]' 39 | # "-e" installs the project in editable mode, so any local modifications to the code will take effect without reinstalling. 40 | ``` 41 | 42 | ## Verify the Installation 43 | 44 | To verify that XTuner is installed correctly, print the list of available configuration files. 45 | 46 | **Print Configuration Files:** Run `xtuner list-cfg` in the command line to confirm that the configuration files can be listed. 47 | 48 | ```shell 49 | xtuner list-cfg 50 | ``` 51 | 52 | You should see a list of XTuner configuration files, corresponding to the ones in [xtuner/configs](https://github.com/InternLM/xtuner/tree/main/xtuner/configs) in the source code. 53 | -------------------------------------------------------------------------------- /docs/en/get_started/overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This chapter introduces the framework and workflow of XTuner, and provides links to detailed tutorials.
4 | 5 | ## What is XTuner 6 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/ftdp_dataset/Case1.rst: -------------------------------------------------------------------------------- 1 | Case 1 2 | ====== 3 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/ftdp_dataset/Case2.rst: -------------------------------------------------------------------------------- 1 | Case 2 2 | ====== 3 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/ftdp_dataset/Case3.rst: -------------------------------------------------------------------------------- 1 | Case 3 2 | ====== 3 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/ftdp_dataset/Case4.rst: -------------------------------------------------------------------------------- 1 | Case 4 2 | ====== 3 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/ftdp_dataset/ftdp.rst: -------------------------------------------------------------------------------- 1 | ftdp 2 | ==== 3 | -------------------------------------------------------------------------------- /docs/en/internevo_migration/internevo_migration.rst: -------------------------------------------------------------------------------- 1 | InternEVO Migration 2 | =================== 3 | -------------------------------------------------------------------------------- /docs/en/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/en/models/supported.md: -------------------------------------------------------------------------------- 1 | # Supported Models 2 | -------------------------------------------------------------------------------- /docs/en/notes/changelog.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Changelog 16 | 17 | ## v0.1.0 (2023.08.30) 18 | 19 | XTuner is released! 🔥🔥🔥 20 | 21 | ### Highlights 22 | 23 | - XTuner supports LLM fine-tuning on consumer-grade GPUs. The minimum GPU memory required for 7B LLM fine-tuning is only **8GB**. 24 | - XTuner supports various LLMs, datasets, algorithms and training pipelines. 25 | - Several fine-tuned adapters are released simultaneously, including various gameplays such as the colorist LLM, plugins-based LLM, and many more. 
For further details, please visit [XTuner on HuggingFace](https://huggingface.co/xtuner)! 26 | -------------------------------------------------------------------------------- /docs/en/preparation/pretrained_model.rst: -------------------------------------------------------------------------------- 1 | Pretrained Model 2 | ================ 3 | -------------------------------------------------------------------------------- /docs/en/preparation/prompt_template.rst: -------------------------------------------------------------------------------- 1 | Prompt Template 2 | =============== 3 | -------------------------------------------------------------------------------- /docs/en/switch_language.md: -------------------------------------------------------------------------------- 1 | ## English 2 | 3 | ## 简体中文 4 | -------------------------------------------------------------------------------- /docs/en/training/custom_agent_dataset.rst: -------------------------------------------------------------------------------- 1 | Custom Agent Dataset 2 | ==================== 3 | -------------------------------------------------------------------------------- /docs/en/training/custom_pretrain_dataset.rst: -------------------------------------------------------------------------------- 1 | Custom Pretrain Dataset 2 | ======================= 3 | -------------------------------------------------------------------------------- /docs/en/training/custom_sft_dataset.rst: -------------------------------------------------------------------------------- 1 | Custom SFT Dataset 2 | ================== 3 | -------------------------------------------------------------------------------- /docs/en/training/modify_settings.rst: -------------------------------------------------------------------------------- 1 | Modify Settings 2 | =============== 3 | -------------------------------------------------------------------------------- /docs/en/training/multi_modal_dataset.rst: -------------------------------------------------------------------------------- 1 | Multi-modal Dataset 2 | =================== 3 | -------------------------------------------------------------------------------- /docs/en/training/open_source_dataset.rst: -------------------------------------------------------------------------------- 1 | Open Source Datasets 2 | ==================== 3 | -------------------------------------------------------------------------------- /docs/en/training/visualization.rst: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | -------------------------------------------------------------------------------- /docs/zh_cn/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.8" 7 | 8 | formats: 9 | - epub 10 | 11 | python: 12 | install: 13 | - requirements: requirements/docs.txt 14 | 15 | sphinx: 16 | configuration: docs/zh_cn/conf.py 17 | -------------------------------------------------------------------------------- /docs/zh_cn/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/zh_cn/_static/image/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/xtuner/53f2429d8a4662c04a8a4a2dc5c941672f4d3bdd/docs/zh_cn/_static/image/logo.png -------------------------------------------------------------------------------- /docs/zh_cn/acceleration/flash_attn.rst: -------------------------------------------------------------------------------- 1 | .. _flash_attn: 2 | 3 | Flash Attention 4 | ================================================== 5 | 6 | Flash Attention (Flash Attention 2) 是一种用于加速 Transformer 模型中 Attention 计算,并减少其显存消耗的算法。XTuner 中 Flash Attention (Flash Attention 2) 的支持情况如下表所示: 7 | 8 | .. list-table:: 9 | :widths: 25 50 10 | :header-rows: 1 11 | 12 | * - 模型 13 | - Flash Attention 支持情况 14 | * - baichuan 1/2 15 | - ❌ 16 | * - chatglm 2/3 17 | - ❌ 18 | * - deepseek 19 | - ✅ 20 | * - gemma 21 | - ❌ 22 | * - internlm 1/2 23 | - ✅ 24 | * - llama 2 25 | - ✅ 26 | * - mistral 27 | - ✅ 28 | * - qwen 1/1.5 29 | - ✅ 30 | * - starcoder 31 | - ✅ 32 | * - yi 33 | - ✅ 34 | * - zephyr 35 | - ✅ 36 | 37 | .. note:: 38 | XTuner 会根据运行环境自动控制 Flash Attention 的使用情况 (见 `dispatch_modules `_): 39 | 40 | .. list-table:: 41 | :widths: 50 50 42 | :header-rows: 1 43 | 44 | * - 环境 45 | - Flash Attention 使用情况 46 | * - 安装 `flash attn `_ 47 | - Flash Attention 2 48 | * - 未安装 `flash attn `_ 且 PyTorch Version <= 1.13 49 | - No Flash Attention 50 | * - 未安装 `flash attn `_ 且 2.0 <= PyTorch Version <= 2.1 51 | - Flash Attention 1 52 | * - 未安装 `flash attn `_ 且 PyTorch Version >= 2.2 53 | - Flash Attention 2 54 | 55 | .. 
note:: 56 | 使用 XTuner 训练 QWen1/1.5 时若想使用 Flash Attention 加速,需要先安装 `flash attn `_ (参考 `flash attn 安装 `_,需要 cuda ) 57 | -------------------------------------------------------------------------------- /docs/zh_cn/acceleration/hyper_parameters.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | 调整加速策略 3 | ===================== 4 | 5 | 本节将会列举 XTuner 中会影响训练速度的配置项。 6 | 7 | 8 | max_length 9 | ------------------- 10 | 11 | ``max_length`` 表示在数据预处理过程中,单条数据长度超过 ``max_length`` 的部分会被截断,基本所有实验都会设置该项。 12 | 13 | pack_to_max_length 14 | --------------------------- 15 | 16 | ``pack_to_max_length`` 用于配置是否进行\ :ref:`数据集拼接 ` \ 。 17 | 18 | ``pack_to_max_length = True`` 表示在数据预处理过程中将多条短数据拼接为一条长度为 ``max_length`` 的长数据,该配置可以大幅提升训练速度。 19 | 20 | 若 ``pack_to_max_length = False``,则推荐将 ``batch_size`` 适度调大以保证训练的稳定性。 21 | 22 | use_varlen_attn 23 | --------------------------- 24 | 25 | ``use_varlen_attn`` 用于配置是否在训练过程中使用\ :ref:`Varlen Flash Attention ` \ 。 26 | 27 | 当 ``use_varlen_attn = True`` 时,要求 ``pack_to_max_length`` 也要设置为 True。在此情况下,每个 token 在注意力计算阶段仅会关注其所在短数据中的所有 tokens (而非整个序列)。 28 | 29 | 当 ``use_varlen_attn = False`` 时,每个 token 在注意力计算阶段会关注整个序列。 30 | 31 | max_position_embeddings 32 | --------------------------------- 33 | 34 | 当需要扩展模型上下文窗口的大小时,需要将 ``max_position_embeddings`` 设置为期望的上下文长度。 **需要保证 max_position_embeddings 不大于 max_length。**\ 35 | 36 | 假设需要将 Llama2-7B 模型支持的上下文长度自 4k 拓展为 32k: 37 | 38 | 1. 若训练数据集中存在较多长度接近 32k 的数据,则推荐 ``max_length = 32k, pack_to_max_length = False, use_varlen_attn = False, max_position_embeddings = 32k`` 这一配置 39 | 2. 若训练数据集中长度接近 32k 的数据量较少甚至没有时,则推荐 ``max_length = 32k, pack_to_max_length = True, use_varlen_attn = False, max_position_embeddings = 32k`` 这一配置 40 | 41 | sequence_parallel_size 42 | ------------------------------------------- 43 | 44 | 在使用序列并行策略训练超长序列时, ``sequence_parallel_size`` 个 GPUs 会共同计算一条长序列。而 ``accumulative_counts`` 则用于控制模型参数更新的频率。 45 | 46 | 47 | accumulative_counts 48 | ---------------------------------------------- 49 | 用于控制模型参数更新的频率;假设需要在 N 块 GPUs 上执行 ``batch_size_per_device = 1, max_length = 128k`` 的训练策略。当设置序列并行维度为 ``sequence_parallel_size`` 后,为了保证训练的等价性, ``accumulative_counts`` 需要设置为原来的 ``sequence_parallel_size`` 倍,因为 128k 长度的序列会被切分为 ``sequence_parallel_size`` 份后分发给 ``sequence_parallel_size`` 个 GPUs 进行训练, ``data_parallel_world_size`` 会变为原来的 :math:`\frac{1}{sequence\_parallel\_size}`。 50 | -------------------------------------------------------------------------------- /docs/zh_cn/acceleration/length_grouped_sampler.rst: -------------------------------------------------------------------------------- 1 | .. _length_grouped_sampler: 2 | 3 | 数据分组 4 | ======================== 5 | 6 | .. raw:: html 7 | 8 |
9 | 10 |
11 | 12 | 生成式大模型(例如LLM)的训练数据往往是不定长的,这就导致同一批次(batch)内的数据长短不一。为实现并行化训练,一种常见的做法是将同一批次的数据填充到最长长度。然而,这一填充(Pad)操作会导致训练的低效。如上图,假设数据内各样本的长度分别为 13 | 2、3、7、9,期望分为2个批次进行训练,那么如果使用默认的随机采样器(左侧),数据处理阶段会引入过多的填充数据,实际效率只有65.6%。 14 | 15 | 现阶段有两种技术方案可以解决 / 缓解这一问题(两者选其一即可,优先考虑 16 | **数据拼接技术**\ ): 17 | 18 | 1. 利用 19 | **数据拼接技术**\ ,将多条数据拼接至训练支持的最大长度。这一做法可以确保同一批次内的数据长度完全一致,进而避免了填充数据所导致的训练效率降低。具体可参考 20 | \ :ref:`数据拼接文档 ` \ 。 21 | 22 | :优点: 可以合并多个数据样本,显著降低训练 iter 数,加速效果好。 23 | 24 | :缺点: 随机合并的多个数据样本间会互相影响,进而影响训练效果(实际影响程度未知);数据进行了合并,丢失了一定数据随机性。 25 | 26 | 2. (本文)利用 27 | **基于数据长度分组的采样器**\ ,在构建批次数据时,基于实际长度进行排序,确保同一批次内的数据长度尽可能相近,进而尽可能减少填充的长度。如上图右侧,利用该采样器后,同样的数据效率将提升至87.5%。 28 | 29 | :优点: 每条数据依然独立存在(独立计算 30 | attention),避免数据拼接技术导致的数据样本间的互相影响;数据进行了分组,丢失了一定数据随机性。 31 | 32 | :缺点: 在数据样本长度比较一致的情况下,加速效果一般。 33 | 34 | 使用 ``LengthGroupedSampler`` 35 | ----------------------------------------- 36 | 37 | XTuner 中基于数据长度分组的采样器 的实现在 38 | `这里 `__\ 。用户可以通过在配置文件中修改 39 | ``train_dataloader`` 的 ``sampler`` 参数进行配置。以 40 | `internlm2_chat_7b_qlora_oasst1_512_e3 `__ 41 | 配置文件为例,其默认是使用随机的采样器,我们可以通过下列修改使其使用 42 | 基于数据长度分组的采样器: 43 | 44 | .. code:: diff 45 | 46 | - from mmengine.dataset import DefaultSampler 47 | + from xtuner.dataset.samplers import LengthGroupedSampler 48 | 49 | batch_size = 16 # per_device 50 | accumulative_counts = 1 51 | 52 | train_dataloader = dict( 53 | batch_size=batch_size, 54 | num_workers=dataloader_num_workers, 55 | dataset=train_dataset, 56 | - sampler=dict(type=DefaultSampler, shuffle=True), 57 | + sampler=dict( 58 | + type=LengthGroupedSampler, 59 | + length_property='length', 60 | + per_device_batch_size=batch_size * accumulative_counts), 61 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 62 | 63 | .. note:: 64 | 其中,\ ``length_property`` 65 | 需要传入获取数据集长度的“属性”,这一数值在通过 ``process_hf_dataset`` 66 | 构建数据集时会自动设置为 67 | ``'length'``\ (因此,如果使用自定义的数据类,请确保这一属性的正确设置)。 68 | -------------------------------------------------------------------------------- /docs/zh_cn/acceleration/pack_to_max_length.rst: -------------------------------------------------------------------------------- 1 | .. _pack_to_max_length: 2 | 3 | 数据拼接 4 | ========================= 5 | 6 | 简介 7 | --------- 8 | 9 | 对于大型语言模型(LLM)的输入而言,“数据集拼接” 这一概念指的是将多个 token 序列拼接成一个单独的输入。大量的数据集都存在一个特点,即其长度分布严重偏向较短的序列,而 Transformers 模型接收固定长度的输入。因此,在模型训练过程中,通常需要将每条数据 "Pad" 至当前 batch 最长序列的长度,而 "Pad Token" 往往是某个特定的无意义的 token。 10 | 11 | 将多条数据打包在一起可以不再需要使用 "Pad Token" 进行无意义的填充,减少计算资源的浪费,同时还可以保持模型作为具有固定大小输入的静态图表示的优点。 12 | 13 | 下表展示了 InternLM2 7B 模型在 Alpaca 数据集上使用不同数据集拼接策略进行训练的速度对比,如表所示,“数据集拼接”会大幅度提升训练效率: 14 | 15 | .. list-table:: 16 | :widths: 25 25 15 17 | :header-rows: 1 18 | 19 | * - 拼接策略 20 | - 每秒处理 token 数 21 | - 加速比 22 | * - 不使用 23 | - 362.9 24 | - 25 | * - 拼接至 2k 26 | - 2677.1 27 | - 7.38x 28 | * - 拼接至 4k 29 | - 3124.3 30 | - 8.61x 31 | * - 拼接至 8k 32 | - 3173.9 33 | - 8.76x 34 | * - 拼接至 16k 35 | - 2864.4 36 | - 7.89x 37 | * - 拼接至 32k 38 | - 2965.4 39 | - 8.17x 40 | 41 | 使用数据拼接 42 | --------------------------- 43 | 44 | XTuner 中提供的 config 文件中默认使用了“数据集拼接”这一功能,可以通过设置 ``max_length`` 字段来调整数据拼接长度。例如可通过以下方式将拼接长度调整为 32k : 45 | 46 | .. 
code-block:: diff 47 | 48 | ####################################################################### 49 | # PART 1 Settings # 50 | ####################################################################### 51 | - max_length = 2048 52 | + max_length = 32768 53 | pack_to_max_length = True 54 | 55 | ####################################################################### 56 | # PART 3 Dataset & Dataloader # 57 | ####################################################################### 58 | train_dataset = dict( 59 | max_length=max_length, 60 | pack_to_max_length=pack_to_max_length, 61 | ...) 62 | 63 | .. tip:: 64 | 若不想使用数据拼接,在 config 中将 ``pack_to_max_length`` 设为 False 即可, 65 | 此时 config 中的 ``max_length`` 字段表示单条数据最长的 token 数,整个 batch 会被 pad 成当前 batch 内最长的一条数据的长度。 66 | 67 | .. tip:: 68 | 在不使用数据拼接策略时,XTuner 还提供了一种数据集采样策略 (``LengthGroupedSampler``),可以保证在一个 batch 中的数据长度尽可能接近, 69 | 以减少 Pad 对计算资源的浪费。详细用法请参考 70 | \ :ref:`LengthGroupedSampler 文档 ` \ 。 71 | -------------------------------------------------------------------------------- /docs/zh_cn/chat/agent.md: -------------------------------------------------------------------------------- 1 | # 智能体模型对话 2 | -------------------------------------------------------------------------------- /docs/zh_cn/chat/llm.md: -------------------------------------------------------------------------------- 1 | # 语言模型对话 2 | -------------------------------------------------------------------------------- /docs/zh_cn/chat/lmdeploy.md: -------------------------------------------------------------------------------- 1 | # 使用 LMDeploy 优化推理速度 2 | -------------------------------------------------------------------------------- /docs/zh_cn/chat/vlm.md: -------------------------------------------------------------------------------- 1 | # 视觉-语言模型对话 2 | -------------------------------------------------------------------------------- /docs/zh_cn/dpo/overview.md: -------------------------------------------------------------------------------- 1 | ## DPO 介绍 2 | 3 | ### 简介 4 | 5 | DPO(Direct Preference Optimization,直接偏好优化)是一种在大语言模型训练中用于直接优化模型偏好的方法。与传统的强化学习方法不同,DPO 直接使用人类偏好数据进行模型优化,从而提高生成内容的质量,使其更符合人类偏好。DPO 利用人类偏好数据,直接对模型进行优化,省略了训练 Reward Model 的训练过程,与 PPO 相比进一步省去了 Critic Model,不但避免了复杂的强化学习算法,减少了训练开销,同时还提高了训练效率。 6 | 7 | DPO 拥有大量的衍生算法,它们对 DPO 的损失函数进行了一定程度上的改进,我们在 XTuner 中除了 DPO 还实现了[Identity Preference Optimisation (IPO)](https://huggingface.co/papers/2310.12036),[Kahneman-Tversky Optimisation (KTO)](https://github.com/ContextualAI/HALOs)等论文中的损失函数,如需使用这些算法,请参考[修改 DPO 配置](./modify_settings.md)章节。我们也提供了一些[示例配置](https://github.com/InternLM/xtuner/tree/main/xtuner/configs/dpo)用于参考。 8 | 9 | 除了 DPO 之外,还出现了如 [ORPO](https://arxiv.org/abs/2403.07691) 等无需参考模型的对齐算法。ORPO 采用了对数比值(odds ratio)的概念来优化模型,通过在模型训练过程中惩罚那些被拒绝的样本,从而更有效地适应被选择的样本。ORPO 消除了对参考模型的依赖,使得训练过程更加简化且高效。XTuner 中 ORPO 的训练方式与 DPO 非常类似,我们提供了一些 ORPO 的[示例配置](https://github.com/InternLM/xtuner/tree/main/xtuner/configs/orpo),用户可以参考 DPO 的教程对配置进行修改。 10 | 11 | ### XTuner 中 DPO 训练的优势 12 | 13 | XTuner 中的 DPO 训练具备以下显著优势: 14 | 15 | 1. **支持最新的算法**:XTuner除了支持标准的 DPO 之外,还支持了大量的衍生算法,同时也支持ORPO等不依赖参考模型的高效算法。 16 | 17 | 2. **减少显存浪费**:由于偏好数据中的 chosen 和 rejected 数据通常存在长度上的差异,因此在训练数据的拼接时会存在填充(padding token),造成显存浪费。在 XTuner 中,基于 Flash Attention2 中的[变长注意力](https://xtuner.readthedocs.io/zh-cn/latest/acceleration/varlen_flash_attn.html)功能,我们在训练过程中通过将偏好数据打包到同一个序列中,显著减少了由于 padding token 带来的显存浪费。这不仅提高了显存的利用效率,还使得在相同硬件条件下可以训练更大的模型或处理更多的数据。 18 | 19 | ![img](../reward_model/images/var_len_atten.png) 20 | 21 | 3. 
**高效训练**:借助 XTuner 的 QLoRA 训练功能,参考模型能够被转化为移除LoRA适配器的语言模型,从而省去了参考模型权重的显存占用,大幅降低了 DPO 的训练开销。 22 | 23 | 4. **长文本训练**: 借助 XTuner 的序列并行功能,能够对长文本数据进行训练。 24 | 25 | ### 开始训练 26 | 27 | 请参阅[快速上手](./quick_start.md)来了解最基本的概念,若希望了解更多训练参数配置相关的内容,请参考[修改DPO配置](./modify_settings.md)章节。 28 | -------------------------------------------------------------------------------- /docs/zh_cn/dpo/quick_start.md: -------------------------------------------------------------------------------- 1 | ## DPO 快速上手 2 | 3 | 在本章节中,我们将介绍如何使用 XTuner 训练 1.8B 的 DPO(Direct Preference Optimization)模型,以帮助您快速上手。 4 | 5 | ### 准备预训练模型权重 6 | 7 | 我们使用经过 SFT 的语言模型[InternLM2-chat-1.8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)作为 DPO 模型的初始化模型来进行偏好对齐。 8 | 9 | 在训练配置文件中设置`pretrained_model_name_or_path = 'internlm/internlm2-chat-1_8b-sft'`,则会在启动训练时自动下载模型文件。若您需要手动下载模型权重,那么请参考[准备预训练模型权重](https://xtuner.readthedocs.io/zh-cn/latest/preparation/pretrained_model.html)章节,其中详细说明了如何从 Huggingface 或者是 Modelscope 下载模型权重的方法。这里我们附上模型的 HuggingFace 链接与 ModelScope 链接: 10 | 11 | - HuggingFace 链接位于:https://huggingface.co/internlm/internlm2-chat-1_8b-sft 12 | - ModelScope 链接位于:https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary 13 | 14 | ### 准备训练数据 15 | 16 | 在本教程中使用 Huggingface 上的[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)数据集作为演示, 17 | 18 | ```python 19 | train_dataset = dict( 20 | type=build_preference_dataset, 21 | dataset=dict( 22 | type=load_dataset, 23 | path='mlabonne/orpo-dpo-mix-40k'), 24 | dataset_map_fn=orpo_dpo_mix_40k_map_fn, 25 | is_dpo=True, 26 | is_reward=False, 27 | ) 28 | ``` 29 | 30 | 在配置文件中使用以上配置,即可自动下载并处理该数据集。如果您希望使用其他 Huggingface 上的开源数据集或是使用自定义的数据集,请参阅[偏好数据集](../reward_model/preference_data.md)章节。 31 | 32 | ### 准备配置文件 33 | 34 | XTuner 提供了多个开箱即用的配置文件,可以通过 `xtuner list-cfg` 查看。我们执行如下指令,以复制一个配置文件到当前目录。 35 | 36 | ```bash 37 | xtuner copy-cfg internlm2_chat_1_8b_dpo_full . 
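# 提示:也可先执行 `xtuner list-cfg -p dpo`,按关键字筛选查看 DPO 相关的内置配置(示例用法,非必需步骤)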
38 | ``` 39 | 40 | 打开复制后的配置文件,如果您选择自动下载模型和数据集,则无需修改配置。若您希望填入您预先下载的模型路径和数据集路径,请修改配置中的`pretrained_model_name_or_path`以及`train_dataset`中`dataset`的`path`参数。 41 | 42 | 更多的训练参数配置,请参阅[修改DPO训练配置](./modify_settings.md)章节。 43 | 44 | ### 启动训练 45 | 46 | 在完成上述操作后,便可以使用下面的指令启动训练任务了。 47 | 48 | ```bash 49 | # 单机单卡 50 | xtuner train ./internlm2_chat_1_8b_dpo_full_copy.py 51 | # 单机多卡 52 | NPROC_PER_NODE=${GPU_NUM} xtuner train ./internlm2_chat_1_8b_dpo_full_copy.py 53 | # slurm 集群 54 | srun ${SRUN_ARGS} xtuner train ./internlm2_chat_1_8b_dpo_full_copy.py --launcher slurm 55 | ``` 56 | 57 | ### 模型转换 58 | 59 | XTuner 已经集成好了将模型转换为 HuggingFace 格式的工具,我们只需要执行 60 | 61 | ```bash 62 | # 创建存放 hf 格式参数的目录 63 | mkdir work_dirs/internlm2_chat_1_8b_dpo_full_copy/iter_15230_hf 64 | 65 | # 转换格式 66 | xtuner convert pth_to_hf internlm2_chat_1_8b_dpo_full_copy.py \ 67 | work_dirs/internlm2_chat_1_8b_dpo_full_copy/iter_15230.pth \ 68 | work_dirs/internlm2_chat_1_8b_dpo_full_copy/iter_15230_hf 69 | ``` 70 | 71 | 便能够将 XTuner 的 ckpt 转换为 HuggingFace 格式的模型。 72 | -------------------------------------------------------------------------------- /docs/zh_cn/evaluation/hook.md: -------------------------------------------------------------------------------- 1 | # 训练过程中评测 2 | -------------------------------------------------------------------------------- /docs/zh_cn/evaluation/mmbench.md: -------------------------------------------------------------------------------- 1 | # MMBench (VLM) 2 | -------------------------------------------------------------------------------- /docs/zh_cn/evaluation/mmlu.md: -------------------------------------------------------------------------------- 1 | # MMLU (LLM) 2 | -------------------------------------------------------------------------------- /docs/zh_cn/evaluation/opencompass.md: -------------------------------------------------------------------------------- 1 | # 使用 OpenCompass 评测 2 | -------------------------------------------------------------------------------- /docs/zh_cn/get_started/installation.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | 安装 3 | ================================== 4 | 5 | 本节中,我们将演示如何安装 XTuner。 6 | 7 | 最佳实践 8 | ======== 9 | 10 | 我们推荐用户参照我们的最佳实践安装 XTuner。 11 | 推荐使用 Python-3.10 的 conda 虚拟环境安装 XTuner。 12 | 13 | **步骤 0.** 使用 conda 先构建一个 Python-3.10 的虚拟环境 14 | 15 | .. code-block:: console 16 | 17 | $ conda create --name xtuner-env python=3.10 -y 18 | $ conda activate xtuner-env 19 | 20 | **步骤 1.** 安装 XTuner 21 | 22 | 方案a: 通过 pip 直接安装 23 | 24 | .. code-block:: console 25 | 26 | $ pip install -U 'xtuner[deepspeed]' 27 | 28 | 方案b: 从源码安装 29 | 30 | .. code-block:: console 31 | 32 | $ git clone https://github.com/InternLM/xtuner.git 33 | $ cd xtuner 34 | $ pip install -e '.[deepspeed]' 35 | 36 | .. note:: 37 | 38 | "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效 39 | 40 | 验证 41 | ======== 42 | 43 | 为了验证 XTuner 是否安装正确,我们将使用命令打印配置文件。 44 | 45 | **打印配置文件:** 在命令行中使用 ``xtuner list-cfg`` 验证是否能打印配置文件列表。 46 | 47 | .. code-block:: console 48 | 49 | $ xtuner list-cfg 50 | -------------------------------------------------------------------------------- /docs/zh_cn/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=.
11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/zh_cn/models/supported.md: -------------------------------------------------------------------------------- 1 | # 已支持的模型 2 | -------------------------------------------------------------------------------- /docs/zh_cn/notes/changelog.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # 变更日志 16 | 17 | ## v0.1.0 (2023.08.30) 18 | 19 | XTuner 正式发布!🔥🔥🔥 20 | 21 | ### 亮点 22 | 23 | - XTuner 支持使用消费级显卡微调大语言模型。微调 7B 大语言模型的最低显存开销仅为 **8GB**。 24 | - XTuner 支持多种大语言模型、数据集、微调算法和训练流程。 25 | - 众多微调好的 adapter 也同步发布,包括调色师、插件对话等多种玩法。更多信息,请访问 [HuggingFace 仓库](https://huggingface.co/xtuner)。 26 | -------------------------------------------------------------------------------- /docs/zh_cn/reward_model/images/preference_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/xtuner/53f2429d8a4662c04a8a4a2dc5c941672f4d3bdd/docs/zh_cn/reward_model/images/preference_data.png -------------------------------------------------------------------------------- /docs/zh_cn/reward_model/images/sequence_parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/xtuner/53f2429d8a4662c04a8a4a2dc5c941672f4d3bdd/docs/zh_cn/reward_model/images/sequence_parallel.png -------------------------------------------------------------------------------- /docs/zh_cn/reward_model/images/var_len_atten.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/xtuner/53f2429d8a4662c04a8a4a2dc5c941672f4d3bdd/docs/zh_cn/reward_model/images/var_len_atten.png -------------------------------------------------------------------------------- /docs/zh_cn/switch_language.md: -------------------------------------------------------------------------------- 1 | ## English 2 | 3 | ## 简体中文 4 | -------------------------------------------------------------------------------- /docs/zh_cn/training/visualization.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | 可视化训练过程 3 | ============== 4 | 5 | XTuner 支持通过 `MMEngine `__ 6 | 使用 `TensorBoard `__ 7 | 和 `Weights & Biases (WandB) `__ 8 | 实验管理工具,只需在 config 中添加一行代码,就可以跟踪和可视化损失、显存占用等指标。 9 | 10 | TensorBoard 11 | ============ 12 | 13 | 1. 设置 config 中的 ``visualizer`` 字段,并将 ``vis_backends`` 设置为 `TensorboardVisBackend `__\ : 14 | 15 | .. code:: diff 16 | 17 | # set visualizer 18 | - visualizer = None 19 | + from mmengine.visualization import Visualizer, TensorboardVisBackend 20 | + visualizer = dict(type=Visualizer, vis_backends=[dict(type=TensorboardVisBackend)]) 21 | 22 | 2. 
启动实验后,tensorboard 产生的相关文件会存在 ``vis_data`` 中,通过 tensorboard 命令可以启动进行实时可视化: 23 | 24 | |image1| 25 | 26 | .. code:: 27 | 28 | tensorboard --logdir=$PATH_TO_VIS_DATA 29 | 30 | WandB 31 | ====== 32 | 33 | 1. 使用 WandB 前需安装依赖库 ``wandb`` 并登录至 wandb。 34 | 35 | .. code:: console 36 | 37 | $ pip install wandb 38 | $ wandb login 39 | 40 | 2. 设置 config 中的 ``visualizer`` 字段,并将 ``vis_backends`` 设置为 `WandbVisBackend `__\ : 41 | 42 | .. code:: diff 43 | 44 | # set visualizer 45 | + from mmengine.visualization import Visualizer, WandbVisBackend 46 | - visualizer = None 47 | + visualizer = dict(type=Visualizer, vis_backends=[dict(type=WandbVisBackend)]) 48 | 49 | .. tip:: 50 | 可以点击 `WandbVisBackend 51 | API `__ 52 | 查看 ``WandbVisBackend`` 可配置的参数。例如 53 | ``init_kwargs``\ ,该参数会传给 54 | `wandb.init `__ 方法。 55 | 56 | .. code:: diff 57 | 58 | # set visualizer 59 | - visualizer = None 60 | + from mmengine.visualization import Visualizer, WandbVisBackend 61 | + visualizer = dict( 62 | + type=Visualizer, 63 | + vis_backends=[ 64 | + dict(type=WandbVisBackend, init_kwargs=dict(project='toy-example'))]) 65 | 66 | 67 | 3. 启动实验后,可在 wandb 网页端 ``https://wandb.ai`` 上查看可视化结果: 68 | 69 | |image2| 70 | 71 | 72 | .. |image1| image:: https://github.com/InternLM/xtuner/assets/67539920/abacb28f-5afd-46d0-91b2-acdd20887969 73 | .. |image2| image:: https://github.com/InternLM/xtuner/assets/41630003/fc16387a-3c83-4015-9235-8ec811077953 74 | -------------------------------------------------------------------------------- /docs/zh_cn/user_guides/ceph.md: -------------------------------------------------------------------------------- 1 | ## 功能说明 2 | 3 | ### 已支持的功能 4 | 5 | - 保存 DeepSpeed Checkpoint 至 CEPH 6 | - 从 Ceph 上的 DeepSpeed Checkpoint 续训 7 | - `pth_to_hf` 支持 Ceph 上的 DeepSpeed Checkpoint 8 | 9 | ### 暂不支持的功能 10 | 11 | - 训练时从 Ceph 加载 Huggingface 模型, 与 `zero3` 加载权重冲突 12 | - HuggingFace `save_pretrained` 保存至 Ceph, 逻辑过于复杂,没办法 patch 13 | 14 | ## 使用说明 15 | 16 | #### 1. 验证 ceph 环境 17 | 18 | 使用前需确保 `petrel sdk` 可用,并且要使用的 Ceph bucket 存在且可用 19 | 20 | 验证 `aws` 命令行工具 21 | 22 | ```bash 23 | # 验证 aws 命令行工具 24 | aws s3 ls $YOUR_BUCKET 25 | ``` 26 | 27 | 验证 `petrel sdk` 28 | 29 | ```python 30 | bucket = 's3://xxx' 31 | 32 | from mmengine import get_file_backend 33 | backend = get_file_backend(bucket) 34 | 35 | for f in backend.list_dir_or_file(bucket): 36 | print(f) 37 | ``` 38 | 39 | #### 2. 训练时保存 Checkpoint 至 Ceph 40 | 41 | `XTuner` 根据环境变量 `DS_CEPH_DIR` 来判断是否将 checkpoint 保存至 ceph 42 | 43 | ```bash 44 | DS_CEPH_DIR=s3://xxxx srun ${SRUN_ARGS} xtuner train $CONFIG --launcher slurm 45 | ``` 46 | 47 | #### 3. 从 Ceph 上的 Checkpoint 续训 48 | 49 | Resume 时,要填写 checkpoint 在 ceph 上的完整路径 50 | 51 | ```bash 52 | DS_CEPH_DIR=s3://xxxx srun ${SRUN_ARGS} xtuner train $CONFIG --launcher slurm --resume s3://xxx/yyy/epoch_x.pth 53 | ``` 54 | 55 | #### 4. 
将 Ceph 上的 Checkpoint 转换为 HF 模型 56 | 57 | 不支持 `$HF_DIR` 为 ceph 路径 58 | 59 | 由于 Checkpoint 中存储了优化器状态,加载比较耗时,对于 ZeRO 1&2 可以直接加载 checkpoint 中的 `model_states.pt` 文件加速转换过程;ZeRO 3 必须先加载整个 checkpoint 60 | 61 | ```bash 62 | srun ${SRUN_ARGS} xtuner convert pth_to_hf $CONFIG s3://xxx/yyy/epoch_x.pth $HF_DIR 63 | 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/zh_cn/user_guides/ftdp_dataset/README.md: -------------------------------------------------------------------------------- 1 | ftdp 是一个闭源的处理数据工具,开源社区用户可以忽略此文档。 2 | 3 | 本节介绍了常见的 4 种使用 ftdp 数据集训练的使用场景: 4 | 5 | - [Case 1: 使用 Processed 数据集训练 InternLM2](Case1.md) 6 | - [Case 2: 使用 Processed 数据集训练非 InternLM2 模型](Case2.md) 7 | - [Case 3: 使用 Processed 普通对话数据集训任意模型](Case3.md) 8 | - [Case 4: 使用 Tokenized 数据集训练 InternLM2](Case4.md) 9 | 10 | 请先参考下方流程图,选择自己的使用场景。 11 | 12 | ```mermaid 13 | graph TD; 14 | A{ftdp 数据} 15 | A -->|是| B{数据 tokenized} 16 | B -->|否| C{使用 Internlm2 对话模板} 17 | C -->|是| D{训练 Internlm2 } 18 | D -->|是| E[Case 1] 19 | D -->|否| F[Case 2] 20 | C -->|否| G{离线处理数据集} 21 | G -->|是| H[尚不支持] 22 | G -->|否| I[Case 3] 23 | B -->|是| J[Case 4] 24 | ``` 25 | -------------------------------------------------------------------------------- /docs/zh_cn/user_guides/llava_offline.md: -------------------------------------------------------------------------------- 1 | # 离线处理 Llava 训练数据集 2 | 3 | 当训练数据量非常大时,每次训练的时候都先在线处理数据可能会极为耗时。我们可以先对原始数据进行离线处理并保存至本地,随后的多次训练可以读入本地离线处理好的数据后直接开始训练。 4 | 5 | ## Step 1, 导出模板 config 文件 6 | 7 | 可使用以下命令查看 XTuner 中提供的 Llava 训练相关的 config: 8 | 9 | ``` 10 | xtuner list-cfg -p llava 11 | ``` 12 | 13 | 找到需要使用的 config 文件并导出至当前目录下: 14 | 15 | ``` 16 | xtuner copy-cfg ${CONFIG_NAME} . 17 | ``` 18 | 19 | ## Step 2, 离线处理数据集 20 | 21 | 使用以下命令可离线处理训练数据集中的文本数据: 22 | 23 | ``` 24 | python xtuner/tools/process_untokenized_llava_data.py \ 25 | ${CONFIG_PATH} \ 26 | --save-folder /folder/to/save/processed/dataset 27 | ``` 28 | 29 | 其中,${CONFIG_PATH} 为第一步中导出的 config 文件路径,`/folder/to/save/processed/dataset` 则需要指定为离线处理数据的保存路径。 30 | 31 | ## Step 3, 修改 config 文件 32 | 33 | 对 Step 1 中导出的 config 文件做如下修改: 34 | 35 | ```diff 36 | ####################################################################### 37 | # PART 3 Dataset & Dataloader # 38 | ####################################################################### 39 | llava_dataset = dict( 40 | - data_path=data_path, 41 | - tokenizer=tokenizer, 42 | + offline_processed_text_folder=/folder/to/save/processed/dataset 43 | ...) 44 | ``` 45 | 46 | 其中,`/folder/to/save/processed/dataset` 为 Step 2 保存的离线处理数据路径。 47 | 48 | ## Step 4,开始训练 49 | 50 | 使用 Step 3 修改得到的 config 训练即可。 51 | -------------------------------------------------------------------------------- /examples/demo_data/multi_turn_1/data.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "messages":[ 3 | { 4 | "toy_system": "You are a helpful AI assistant.", 5 | "toy_input": "Give three tips for staying healthy.", 6 | "toy_output": "1.Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep." 7 | }, 8 | { 9 | "toy_input": "How to study English?", 10 | "toy_output": "1. Set clear goals. 2. Create a study plan. 3. Build vocabulary. 4. Practice speaking." 11 | } 12 | ] 13 | }, 14 | { 15 | "messages":[ 16 | { 17 | "toy_system": "You are a helpful AI assistant.", 18 | "toy_input": "How to study English?", 19 | "toy_output": "1. Set clear goals. 2. Create a study plan. 3. Build vocabulary. 4. Practice speaking." 
20 | }, 21 | { 22 | "toy_input": "Give three tips for staying healthy.", 23 | "toy_output": "1.Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep." 24 | } 25 | ] 26 | }] 27 | -------------------------------------------------------------------------------- /examples/demo_data/multi_turn_1/map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def multi_turn_1_map_fn(example): 3 | messages = example["messages"] 4 | conversation = [] 5 | for msg in messages: 6 | conversation.append( 7 | { 8 | "system": msg["toy_system"], 9 | "input": msg["toy_input"], 10 | "output": msg["toy_output"], 11 | } 12 | ) 13 | return {"conversation": conversation} 14 | -------------------------------------------------------------------------------- /examples/demo_data/multi_turn_2/data.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "messages":[ 3 | { 4 | "role": "system", 5 | "content": "You are a helpful AI assistant." 6 | }, 7 | { 8 | "role": "user", 9 | "content": "Give three tips for staying healthy." 10 | }, 11 | { 12 | "role": "assistant", 13 | "content": "1.Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep." 14 | }, 15 | { 16 | "role": "user", 17 | "content": "How to study English?" 18 | }, 19 | { 20 | "role": "assistant", 21 | "content": "1. Set clear goals. 2. Create a study plan. 3. Build vocabulary. 4. Practice speaking." 22 | } 23 | ] 24 | }, 25 | { 26 | "messages":[ 27 | { 28 | "role": "system", 29 | "content": "You are a helpful AI assistant." 30 | }, 31 | { 32 | "role": "user", 33 | "content": "How to study English?" 34 | }, 35 | { 36 | "role": "assistant", 37 | "content": "1. Set clear goals. 2. Create a study plan. 3. Build vocabulary. 4. Practice speaking." 38 | }, 39 | { 40 | "role": "user", 41 | "content": "Give three tips for staying healthy." 42 | }, 43 | { 44 | "role": "assistant", 45 | "content": "1.Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep." 46 | } 47 | ] 48 | }] 49 | -------------------------------------------------------------------------------- /examples/demo_data/multi_turn_2/map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def multi_turn_2_map_fn(example): 3 | messages = example["messages"] 4 | system = "" 5 | input = "" 6 | conversation = [] 7 | while messages and messages[0]["role"] == "assistant": 8 | # Skip the first one if it is from assistant 9 | messages = messages[1:] 10 | for msg in messages: 11 | if msg["role"] == "system": 12 | system = msg["content"] 13 | elif msg["role"] == "user": 14 | input += msg["content"] 15 | elif msg["role"] == "assistant": 16 | conversation.append( 17 | {"system": system, "input": input, "output": msg["content"]} 18 | ) 19 | system = "" 20 | input = "" 21 | else: 22 | raise NotImplementedError 23 | return {"conversation": conversation} 24 | -------------------------------------------------------------------------------- /examples/demo_data/pretrain/data.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "toy_text": "I am an artificial intelligence (AI) assistant named InternLM. I was created by the Shanghai AI Laboratory and my purpose is to assist users with various tasks through natural language processing technology." 
3 | }, 4 | { 5 | "toy_text": "I am an artificial intelligence programmed to assist with various types of tasks, including answering questions, providing information, and performing automated processes." 6 | }] 7 | -------------------------------------------------------------------------------- /examples/demo_data/pretrain/map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def pretrain_map_fn(example): 3 | return {"conversation": [{"input": "", "output": example["toy_text"].strip()}]} 4 | -------------------------------------------------------------------------------- /examples/demo_data/single_turn/data.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "toy_system": "You are a helpful AI assistant.", 3 | "toy_input": "Give three tips for staying healthy.", 4 | "toy_output": "1.Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep." 5 | }, 6 | { 7 | "toy_system": "You are a helpful AI assistant.", 8 | "toy_input": "How to study English?", 9 | "toy_output": "1. Set clear goals. 2. Create a study plan. 3. Build vocabulary. 4. Practice speaking." 10 | }] 11 | -------------------------------------------------------------------------------- /examples/demo_data/single_turn/map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def single_turn_map_fn(example): 3 | return { 4 | "conversation": [ 5 | { 6 | "system": example["toy_system"], 7 | "input": example["toy_input"], 8 | "output": example["toy_output"], 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /examples/huggingface_trainer/README.md: -------------------------------------------------------------------------------- 1 | # How to use XTuner in a HuggingFace training pipeline 2 | 3 | ## Quick run 4 | 5 | 1. Step into the `examples` directory 6 | 7 | ```shell 8 | cd ./examples 9 | ``` 10 | 11 | 2. Run a training script 12 | 13 | ```shell 14 | # QLoRA training of internlm-7b with the Alpaca dataset 15 | python train_qlora_hf.py --model_name_or_path internlm/internlm-7b --dataset_name_or_path tatsu-lab/alpaca 16 | ``` 17 | 18 | `--model_name_or_path`: specify the model name or path to train. 19 | 20 | `--dataset_name_or_path`: specify the dataset name or path to use. 21 | 22 | ## How to customize your experiment 23 | 24 | XTuner's APIs are compatible with HuggingFace's `transformers`. 25 | If you want to customize your experiment, simply pass in your hyperparameters as you would to a HuggingFace `Trainer`. 26 | 27 | ```shell 28 | # training example 29 | # (--model_name_or_path and --dataset_name_or_path are custom arguments; 30 | # the remaining flags are standard HuggingFace TrainingArguments) 31 | python train_qlora_hf.py \ 32 | --model_name_or_path internlm/internlm-7b \ 33 | --dataset_name_or_path tatsu-lab/alpaca \ 34 | --do_train True \ 35 | --per_device_train_batch_size 1 \ 36 | --learning_rate 2e-5 \ 37 | --save_strategy epoch \ 38 | --lr_scheduler_type cosine \ 39 | --logging_steps 1 40 | ``` 41 | -------------------------------------------------------------------------------- /examples/huggingface_trainer/train_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
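# Plain (no adapter) fine-tuning entry point built on HuggingFace's Trainer.
# Hypothetical example invocation (paths and output directory are placeholders,
# adjust them to your setup):
#   python train_hf.py --model_name_or_path internlm/internlm-7b \
#       --dataset_name_or_path tatsu-lab/alpaca --output_dir ./work_dirs/hf_demo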
2 | import transformers 3 | from transformers import Trainer 4 | 5 | from xtuner.apis import DefaultTrainingArguments, build_model 6 | from xtuner.apis.datasets import alpaca_data_collator, alpaca_dataset 7 | 8 | 9 | def train(): 10 | # get DefaultTrainingArguments and to be updated with passed args 11 | parser = transformers.HfArgumentParser(DefaultTrainingArguments) 12 | training_args = parser.parse_args_into_dataclasses()[0] 13 | 14 | # init model and dataset 15 | model, tokenizer = build_model( 16 | model_name_or_path=training_args.model_name_or_path, return_tokenizer=True 17 | ) 18 | train_dataset = alpaca_dataset( 19 | tokenizer=tokenizer, path=training_args.dataset_name_or_path 20 | ) 21 | data_collator = alpaca_data_collator(return_hf_format=True) 22 | 23 | # build trainer 24 | trainer = Trainer( 25 | model=model, 26 | args=training_args, 27 | train_dataset=train_dataset, 28 | data_collator=data_collator, 29 | ) 30 | 31 | # training 32 | trainer.train() 33 | 34 | trainer.save_state() 35 | trainer.save_model(output_dir=training_args.output_dir) 36 | 37 | 38 | if __name__ == "__main__": 39 | train() 40 | -------------------------------------------------------------------------------- /examples/huggingface_trainer/train_lora_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import transformers 3 | from transformers import Trainer 4 | 5 | from xtuner.apis import DefaultTrainingArguments, build_lora_model 6 | from xtuner.apis.datasets import alpaca_data_collator, alpaca_dataset 7 | 8 | 9 | def train(): 10 | # get DefaultTrainingArguments and to be updated with passed args 11 | parser = transformers.HfArgumentParser(DefaultTrainingArguments) 12 | training_args = parser.parse_args_into_dataclasses()[0] 13 | 14 | # init model and dataset 15 | model, tokenizer = build_lora_model( 16 | model_name_or_path=training_args.model_name_or_path, return_tokenizer=True 17 | ) 18 | train_dataset = alpaca_dataset( 19 | tokenizer=tokenizer, path=training_args.dataset_name_or_path 20 | ) 21 | data_collator = alpaca_data_collator(return_hf_format=True) 22 | 23 | # build trainer 24 | trainer = Trainer( 25 | model=model, 26 | args=training_args, 27 | train_dataset=train_dataset, 28 | data_collator=data_collator, 29 | ) 30 | 31 | # training 32 | trainer.train() 33 | 34 | trainer.save_state() 35 | trainer.save_model(output_dir=training_args.output_dir) 36 | 37 | 38 | if __name__ == "__main__": 39 | train() 40 | -------------------------------------------------------------------------------- /examples/huggingface_trainer/train_qlora_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
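# QLoRA variant of train_hf.py: build_qlora_model is expected to load the base
# model with 4-bit quantization and attach LoRA adapters, so only the adapter
# weights are updated during training (assumption based on the API name; the
# Trainer pipeline below is otherwise identical to train_hf.py / train_lora_hf.py).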
2 | import transformers 3 | from transformers import Trainer 4 | 5 | from xtuner.apis import DefaultTrainingArguments, build_qlora_model 6 | from xtuner.apis.datasets import alpaca_data_collator, alpaca_dataset 7 | 8 | 9 | def train(): 10 | # get DefaultTrainingArguments and to be updated with passed args 11 | parser = transformers.HfArgumentParser(DefaultTrainingArguments) 12 | training_args = parser.parse_args_into_dataclasses()[0] 13 | 14 | # init model and dataset 15 | model, tokenizer = build_qlora_model( 16 | model_name_or_path=training_args.model_name_or_path, return_tokenizer=True 17 | ) 18 | train_dataset = alpaca_dataset( 19 | tokenizer=tokenizer, path=training_args.dataset_name_or_path 20 | ) 21 | data_collator = alpaca_data_collator(return_hf_format=True) 22 | 23 | # build trainer 24 | trainer = Trainer( 25 | model=model, 26 | args=training_args, 27 | train_dataset=train_dataset, 28 | data_collator=data_collator, 29 | ) 30 | 31 | # training 32 | trainer.train() 33 | 34 | trainer.save_state() 35 | trainer.save_model(output_dir=training_args.output_dir) 36 | 37 | 38 | if __name__ == "__main__": 39 | train() 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/runtime.txt 2 | -r requirements/deepspeed.txt 3 | -r requirements/modelscope.txt 4 | -------------------------------------------------------------------------------- /requirements/deepspeed.txt: -------------------------------------------------------------------------------- 1 | deepspeed==0.16.2 2 | mpi4py-mpich 3 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils 2 | myst-parser==2.0.0 3 | sphinx==6.2.1 4 | sphinx-argparse 5 | sphinx-book-theme==1.0.1 6 | sphinx-copybutton==0.5.2 7 | sphinx_markdown_tables 8 | -------------------------------------------------------------------------------- /requirements/lmdeploy.txt: -------------------------------------------------------------------------------- 1 | lmdeploy>=0.6.2 --no-deps 2 | -------------------------------------------------------------------------------- /requirements/modelscope.txt: -------------------------------------------------------------------------------- 1 | modelscope 2 | -------------------------------------------------------------------------------- /requirements/runtime.txt: -------------------------------------------------------------------------------- 1 | bitsandbytes==0.45.0 2 | datasets>=3.2.0 3 | einops 4 | loguru 5 | mmengine==0.10.6 6 | openpyxl 7 | peft>=0.14.0 8 | scikit-image 9 | scipy 10 | SentencePiece 11 | tiktoken 12 | torch 13 | torchvision 14 | transformers==4.48.0 15 | transformers_stream_generator 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [codespell] 2 | ignore-words-list = nd, ba, warmup, ans 3 | 4 | [flake8] 5 | max-line-length = 119 6 | ignore = D107,D202,D203,D401,E203,W503 7 | inline-quotes = double 8 | 9 | [black] 10 | line-length = 119 11 | 12 | [isort] 13 | profile = black 14 | 15 | 16 | -------------------------------------------------------------------------------- /xtuner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from mmengine.utils import digit_version 5 | 6 | from .entry_point import cli 7 | from .version import __version__, version_info 8 | 9 | HF_CEPH_HUB = os.getenv("HF_CEPH_HUB", "") 10 | HF_USE_CEPH = os.getenv("HF_USE_CEPH", 0) or HF_CEPH_HUB != "" 11 | DS_CEPH_DIR = os.getenv("DS_CEPH_DIR", None) 12 | if HF_USE_CEPH: 13 | from .utils.fileio import patch_hf_auto_from_pretrained, patch_hf_save_pretrained 14 | 15 | patch_hf_auto_from_pretrained(HF_CEPH_HUB) 16 | patch_hf_save_pretrained() 17 | 18 | if DS_CEPH_DIR: 19 | from .utils.fileio import patch_deepspeed_engine 20 | 21 | patch_deepspeed_engine() 22 | 23 | __all__ = [ 24 | "__version__", 25 | "version_info", 26 | "digit_version", 27 | "cli", 28 | "HF_USE_CEPH", 29 | "DS_CEPH_DIR", 30 | ] 31 | -------------------------------------------------------------------------------- /xtuner/_lite/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | from loguru import logger 7 | 8 | from .device import get_device, get_torch_device_module 9 | 10 | _LOGGER = None 11 | 12 | 13 | def log_format(debug=False): 14 | formatter = "[XTuner][{time:YYYY-MM-DD HH:mm:ss}][{level}]" 15 | 16 | if debug: 17 | formatter += "[{name}:" 18 | formatter += "{function}:" 19 | formatter += "{line}]" 20 | 21 | formatter += " {message}" 22 | return formatter 23 | 24 | 25 | def get_logger(level="INFO"): 26 | global _LOGGER 27 | if _LOGGER is None: 28 | # Remove the original logger in Python to prevent duplicate printing. 29 | logger.remove() 30 | logger.add(sys.stderr, level=level, format=log_format(debug=level == "DEBUG")) 31 | _LOGGER = logger 32 | return _LOGGER 33 | 34 | 35 | def get_repo_git_info(repo_path): 36 | original_directory = os.getcwd() 37 | os.chdir(repo_path) 38 | 39 | try: 40 | branch = ( 41 | subprocess.check_output( 42 | ["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.STDOUT 43 | ) 44 | .strip() 45 | .decode("utf-8") 46 | ) 47 | 48 | commit_id = ( 49 | subprocess.check_output( 50 | ["git", "rev-parse", "HEAD"], stderr=subprocess.STDOUT 51 | ) 52 | .strip() 53 | .decode("utf-8") 54 | ) 55 | 56 | remote_url = ( 57 | subprocess.check_output( 58 | ["git", "remote", "get-url", "origin"], stderr=subprocess.STDOUT 59 | ) 60 | .strip() 61 | .decode("utf-8") 62 | ) 63 | 64 | return branch, commit_id, remote_url 65 | except subprocess.CalledProcessError: 66 | return None, None, None 67 | finally: 68 | os.chdir(original_directory) 69 | 70 | 71 | __all__ = [ 72 | "AutoConfig", 73 | "AutoModelForCausalLM", 74 | "AutoTokenizer", 75 | "get_device", 76 | "get_torch_device_module", 77 | ] 78 | -------------------------------------------------------------------------------- /xtuner/_lite/accelerate/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .lora import LORA_TARGET_MAP 3 | from .packed import pack_sequence, unpack_sequence 4 | from .utils import ( 5 | liger_kernel_is_available, 6 | lmdeploy_is_available, 7 | mlu_is_available, 8 | npu_is_available, 9 | profile_time_and_memory, 10 | varlen_attn_is_available, 11 | ) 12 | 13 | __all__ = [ 14 | "LORA_TARGET_MAP", 15 | "pack_sequence", 16 | "packed_sequence", 17 | "unpack_sequence", 18 | "liger_kernel_is_available", 19 | "varlen_attn_is_available", 20 | "lmdeploy_is_available", 21 | "npu_is_available", 22 | "mlu_is_available", 23 | "profile_time_and_memory", 24 | ] 25 | -------------------------------------------------------------------------------- /xtuner/_lite/accelerate/lora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | LORA_TARGET_MAP = { 3 | "InternLM2ForCausalLM": ["wqkv", "wo", "w1", "w2", "w3"], 4 | "CLIPVisionModel": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 5 | } 6 | -------------------------------------------------------------------------------- /xtuner/_lite/accelerate/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .moe_permute import GROUPED_GEMM_INSTALLED, permute_func, unpermute_func 3 | 4 | __all__ = ["GROUPED_GEMM_INSTALLED", "permute_func", "unpermute_func"] 5 | -------------------------------------------------------------------------------- /xtuner/_lite/accelerate/packed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import List, Union 3 | 4 | import torch 5 | 6 | 7 | def unpack_sequence(packed: torch.Tensor, num_tokens: Union[torch.Tensor, List], dim=1): 8 | if isinstance(num_tokens, torch.Tensor): 9 | num_tokens = num_tokens.tolist() 10 | sequences = torch.split(packed, num_tokens, dim=dim) 11 | return sequences 12 | 13 | 14 | def pack_sequence(sequences, dim=1): 15 | num_tokens = torch.IntTensor([seq.size(dim) for seq in sequences]) 16 | packed = torch.cat(sequences, dim=dim) 17 | return packed, num_tokens.to(packed.device) 18 | 19 | 20 | def packed_cumulative_length(num_tokens: torch.Tensor): 21 | device = num_tokens.device 22 | _zero_pad = torch.zeros(1, device=device) 23 | _pad_length = torch.cat([_zero_pad, num_tokens]).int() 24 | return torch.cumsum(_pad_length, 0).int() 25 | -------------------------------------------------------------------------------- /xtuner/_lite/accelerate/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
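# Runtime capability probes (NPU / MLU / variable-length flash attention /
# lmdeploy / liger-kernel) plus a context manager that logs elapsed time and
# peak device memory for the wrapped block.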
2 | import time 3 | from contextlib import contextmanager 4 | 5 | from transformers.utils.import_utils import is_flash_attn_2_available 6 | 7 | from xtuner._lite import get_device, get_logger, get_torch_device_module 8 | 9 | logger = get_logger() 10 | 11 | 12 | def npu_is_available(): 13 | return get_device() == "npu" 14 | 15 | 16 | def mlu_is_available(): 17 | return get_device() == "mlu" 18 | 19 | 20 | def varlen_attn_is_available(): 21 | return is_flash_attn_2_available() or npu_is_available() 22 | 23 | 24 | def lmdeploy_is_available(): 25 | available = False 26 | try: 27 | import lmdeploy # noqa: F401 28 | 29 | available = True 30 | except ImportError: 31 | available = False 32 | 33 | return available 34 | 35 | 36 | def liger_kernel_is_available(): 37 | available = False 38 | try: 39 | import liger_kernel # noqa: F401 40 | 41 | available = True 42 | except ImportError: 43 | available = False 44 | 45 | return available 46 | 47 | 48 | @contextmanager 49 | def profile_time_and_memory(desc): 50 | torch_device = get_torch_device_module() 51 | start_t = time.time() 52 | torch_device.reset_peak_memory_stats() 53 | 54 | yield 55 | 56 | max_memory = torch_device.max_memory_allocated() 57 | cost_time = time.time() - start_t 58 | 59 | logger.success( 60 | f"{desc} Elapsed time {cost_time:.2f} seconds, " 61 | f"peak gpu memory {max_memory/1024**3:.1f}G" 62 | ) 63 | -------------------------------------------------------------------------------- /xtuner/_lite/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /xtuner/_lite/algorithms/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dataset import ( 3 | InferDataset, 4 | PPOTokenizeFunction, 5 | RewardBuffer, 6 | RewardBufferCollator, 7 | ) 8 | from .loss import ( 9 | CriticLoss, 10 | PPOPolicyLoss, 11 | compute_advantages_and_returns, 12 | compute_kl_rewards, 13 | gather_logprobs, 14 | ) 15 | from .model import build_actor_model, build_reward_model 16 | 17 | __all__ = [ 18 | "InferDataset", 19 | "RewardBuffer", 20 | "RewardBufferCollator", 21 | "PPOCollator", 22 | "PPODataset", 23 | "PPOTokenizeFunction", 24 | "CriticLoss", 25 | "PPOPolicyLoss", 26 | "compute_advantages_and_returns", 27 | "compute_kl_rewards", 28 | "compute_rewards", 29 | "gather_logprobs", 30 | "build_actor_model", 31 | "build_reward_model", 32 | ] 33 | -------------------------------------------------------------------------------- /xtuner/_lite/algorithms/ppo/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
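# Builders for the PPO actor (policy) and reward models: both load HuggingFace
# checkpoints with the flash_attention_2 implementation, and the reward model
# is created with its KV cache disabled for training.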
2 | import torch 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | from transformers.utils.import_utils import ( 5 | is_flash_attn_2_available, 6 | is_torch_sdpa_available, 7 | ) 8 | 9 | from xtuner._lite.accelerate import LoadWoInit 10 | 11 | 12 | def build_actor_model(model_path, dtype=torch.float32, trust_remote_code=True): 13 | config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 14 | if is_flash_attn_2_available(): 15 | config.attn_implementation = "flash_attention_2" 16 | elif is_torch_sdpa_available(): 17 | config.attn_implementation = "sdpa" 18 | 19 | with LoadWoInit(): 20 | policy = AutoModelForCausalLM.from_pretrained( 21 | model_path, 22 | attn_implementation="flash_attention_2", 23 | torch_dtype=dtype, 24 | trust_remote_code=trust_remote_code, 25 | ) 26 | 27 | return policy 28 | 29 | 30 | def build_reward_model(model_path, dtype=torch.float32, trust_remote_code=True): 31 | config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 32 | if is_flash_attn_2_available(): 33 | config.attn_implementation = "flash_attention_2" 34 | elif is_torch_sdpa_available(): 35 | config.attn_implementation = "sdpa" 36 | 37 | config.use_cache = False 38 | config.torch_dtype = dtype 39 | with LoadWoInit(): 40 | reward = AutoModel.from_pretrained( 41 | model_path, 42 | attn_implementation="flash_attention_2", 43 | torch_dtype=dtype, 44 | trust_remote_code=trust_remote_code, 45 | ) 46 | 47 | reward.model.use_cache = False 48 | 49 | return reward 50 | -------------------------------------------------------------------------------- /xtuner/_lite/algorithms/sft/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dataset import SftCollator, SftTokenizeFunction 3 | 4 | __all__ = ["SftCollator", "SftTokenizeFunction"] 5 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .messages import ChatMessages 3 | from .templates import CHAT_TEMPLATE_MAP, ChatTemplate, HybridChatTemplate 4 | 5 | __all__ = ["ChatMessages", "CHAT_TEMPLATE_MAP", "ChatTemplate", "HybridChatTemplate"] 6 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/messages/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseMessages 3 | from .chat import ChatMessages 4 | 5 | __all__ = ["BaseMessages", "ChatMessages"] 6 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/messages/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
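# Abstract base class for chat message containers: concrete subclasses must
# implement add/pop, prompt rendering against a ChatTemplate, tokenization into
# model inputs, and construction from a plain dict.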
2 | from abc import abstractclassmethod, abstractmethod 3 | from typing import Dict 4 | 5 | from pydantic import BaseModel 6 | from transformers import PreTrainedTokenizer 7 | 8 | from ..templates import ChatTemplate 9 | 10 | 11 | class BaseMessages(BaseModel): 12 | @abstractmethod 13 | def add(self, role: str, content): 14 | pass 15 | 16 | @abstractmethod 17 | def pop(self): 18 | pass 19 | 20 | @abstractmethod 21 | def get_prompt(self, chat_template: ChatTemplate) -> str: 22 | pass 23 | 24 | @abstractmethod 25 | def tokenize( 26 | self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate 27 | ) -> Dict: 28 | pass 29 | 30 | @abstractclassmethod 31 | def from_dict(cls, item: Dict) -> "BaseMessages": 32 | pass 33 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/templates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .chat import ChatTemplate 3 | from .hybrid import HybridChatTemplate 4 | 5 | CHAT_TEMPLATE_MAP = { 6 | "internlm2": HybridChatTemplate( 7 | system="<|im_start|>system\n{system}<|im_end|>\n", 8 | user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n", 9 | assistant="{assistant}<|im_end|>", 10 | stop_words=["<|im_end|>"], 11 | ), 12 | "qwen2": HybridChatTemplate( 13 | system="<|im_start|>system\n{system}<|im_end|>\n", 14 | user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n", 15 | assistant="{assistant}<|im_end|>", 16 | stop_words=["<|im_end|>", "<|endoftext|>"], 17 | ), 18 | "llama3": HybridChatTemplate( 19 | system=("<|start_header_id|>system<|end_header_id|>\n\n{system}" "<|eot_id|>"), 20 | user=( 21 | "<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>" 22 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 23 | ), 24 | assistant="{assistant}<|eot_id|>", 25 | sep="", 26 | stop_words=["<|eot_id|>"], 27 | ), 28 | } 29 | 30 | __all__ = ["ChatTemplate", "HybridChatTemplate"] 31 | -------------------------------------------------------------------------------- /xtuner/_lite/chat/templates/chat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import List 3 | 4 | from pydantic import BaseModel, field_validator 5 | 6 | 7 | class ChatTemplate(BaseModel): 8 | """Define a Pydantic data model for a hybrid chat with attributes for 9 | system, user and assistant chat as well as function and interpreter calls 10 | and results.""" 11 | 12 | # Normal Chat 13 | system: str # System message format 14 | user: str # User message format 15 | assistant: str # Assistant message format 16 | stop_words: List[str] # List of stop words 17 | sep: str = "\n" 18 | 19 | def decorate_system(self, text: str) -> str: 20 | """Decorate text with the `system` template.""" 21 | return self.system.format(system=text) 22 | 23 | def decorate_assistant(self, text: str) -> str: 24 | """Decorate text with the `assistant` template.""" 25 | return self.assistant.format(assistant=text) 26 | 27 | def decorate_user(self, text: str) -> str: 28 | """Decorate text with the `user` template.""" 29 | return self.user.format(user=text) 30 | 31 | @field_validator("system") 32 | def check_system(cls, v: str) -> str: 33 | """Validate that `system` contains '{system}'. 34 | 35 | If not, raises a ValueError. 
36 | """ 37 | if v is not None and "{system}" not in v: 38 | raise ValueError("system must contain the keyword '{system}'") 39 | return v 40 | 41 | @field_validator("user") 42 | def check_user(cls, v: str) -> str: 43 | """Validate that `user` contains '{user}'. 44 | 45 | If not, raises a ValueError. 46 | """ 47 | if v is not None and "{user}" not in v: 48 | raise ValueError("user must contain the keyword '{user}'") 49 | return v 50 | 51 | @field_validator("assistant") 52 | def check_assistant(cls, v: str) -> str: 53 | """Validate that `assistant` contains '{assistant}'. 54 | 55 | If not, raises a ValueError. 56 | """ 57 | if v is not None and "{assistant}" not in v: 58 | raise ValueError("assistant must contain the keyword '{assistant}'") 59 | return v 60 | -------------------------------------------------------------------------------- /xtuner/_lite/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .json import JsonDataset 3 | from .jsonl import JsonlDataset 4 | from .pack import SoftPackDataset 5 | from .utils import DATASET_CLS_MAP, OPENAI_CONVERT_MAP, load_datasets 6 | 7 | __all__ = [ 8 | "JsonDataset", 9 | "JsonlDataset", 10 | "SoftPackDataset", 11 | "DATASET_CLS_MAP", 12 | "OPENAI_CONVERT_MAP", 13 | "load_datasets", 14 | ] 15 | -------------------------------------------------------------------------------- /xtuner/_lite/datasets/streaming.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | 4 | class Streaming: 5 | def __init__(self, file, max_epoch=1): 6 | self.file = file 7 | self.offset = 0 8 | self.epoch = 1 9 | self.max_epoch = max_epoch 10 | 11 | def __iter__(self): 12 | return self 13 | 14 | def __next__(self): 15 | with open(self.file) as f: 16 | f.seek(self.offset) 17 | line = f.readline() 18 | 19 | if not line and self.epoch < self.max_epoch: 20 | self.offset = 0 21 | self.epoch += 1 22 | return next(self) 23 | 24 | elif not line and self.epoch == self.max_epoch: 25 | raise StopIteration 26 | 27 | self.offset = f.tell() 28 | return line 29 | -------------------------------------------------------------------------------- /xtuner/_lite/datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .convert import OPENAI_CONVERT_MAP 3 | from .load import DATASET_CLS_MAP, load_datasets 4 | from .utils import apply_exif_orientation, move_data_to_device 5 | 6 | __all__ = [ 7 | "OPENAI_CONVERT_MAP", 8 | "DATASET_CLS_MAP", 9 | "load_datasets", 10 | "apply_exif_orientation", 11 | "move_data_to_device", 12 | ] 13 | -------------------------------------------------------------------------------- /xtuner/_lite/datasets/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from collections.abc import Mapping 3 | 4 | import torch 5 | from PIL import Image 6 | 7 | _EXIF_ORIENT = 274 # exif 'Orientation' tag 8 | 9 | 10 | def apply_exif_orientation(image): 11 | """Applies the exif orientation correctly. 12 | 13 | This code exists per the bug: 14 | https://github.com/python-pillow/Pillow/issues/3973 15 | with the function `ImageOps.exif_transpose`. 
The Pillow source raises errors with 16 | various methods, especially `tobytes` 17 | 18 | Function based on: 19 | https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59 20 | https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527 21 | 22 | Args: 23 | image (PIL.Image): a PIL image 24 | 25 | Returns: 26 | (PIL.Image): the PIL image with exif orientation applied, if applicable 27 | """ 28 | if not hasattr(image, "getexif"): 29 | return image 30 | 31 | try: 32 | exif = image.getexif() 33 | except Exception: # https://github.com/facebookresearch/detectron2/issues/1885 34 | exif = None 35 | 36 | if exif is None: 37 | return image 38 | 39 | orientation = exif.get(_EXIF_ORIENT) 40 | 41 | method = { 42 | 2: Image.FLIP_LEFT_RIGHT, 43 | 3: Image.ROTATE_180, 44 | 4: Image.FLIP_TOP_BOTTOM, 45 | 5: Image.TRANSPOSE, 46 | 6: Image.ROTATE_270, 47 | 7: Image.TRANSVERSE, 48 | 8: Image.ROTATE_90, 49 | }.get(orientation) 50 | 51 | if method is not None: 52 | return image.transpose(method) 53 | return image 54 | 55 | 56 | def move_data_to_device(data, device="cuda"): 57 | """Prepares one `data` before feeding it to the model, be it a tensor or a 58 | nested list/dictionary of tensors.""" 59 | if isinstance(data, Mapping): 60 | return type(data)({k: move_data_to_device(v, device) for k, v in data.items()}) 61 | elif isinstance(data, (tuple, list)): 62 | return type(data)(move_data_to_device(v, device) for v in data) 63 | elif isinstance(data, torch.Tensor): 64 | kwargs = {"device": device} 65 | return data.to(non_blocking=True, **kwargs) 66 | return data 67 | -------------------------------------------------------------------------------- /xtuner/_lite/device.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | 5 | def get_device(): 6 | device = None 7 | if torch.cuda.is_available(): 8 | device = "cuda" 9 | else: 10 | try: 11 | import torch_npu # noqa: F401 12 | 13 | device = "npu" 14 | except ImportError: 15 | pass 16 | try: 17 | import torch_mlu # noqa: F401 18 | 19 | device = "mlu" 20 | except ImportError: 21 | pass 22 | 23 | if device is None: 24 | raise NotImplementedError( 25 | "Supports only CUDA, NPU or MLU. If your device is CUDA, NPU " 26 | "or MLU, please make sure that your environmental settings are " 27 | "configured correctly."
28 | ) 29 | 30 | return device 31 | 32 | 33 | def get_torch_device_module(): 34 | device = get_device() 35 | if device == "cuda": 36 | return torch.cuda 37 | elif device == "npu": 38 | return torch.npu 39 | elif device == "mlu": 40 | return torch.mlu 41 | else: 42 | raise NotImplementedError 43 | -------------------------------------------------------------------------------- /xtuner/_lite/modelings/__init__.py: -------------------------------------------------------------------------------- 1 | from .internlm2 import InternLM2Config, InternLM2ForCausalLM 2 | from .internlm3 import InternLM3Config, InternLM3ForCausalLM, InternLM3Tokenizer 3 | from .llava.modeling_llava import LlavaForConditionalGeneration 4 | from .llava.configuration_llava import EnhancedLlavaConfig 5 | from .llava.processing_llava import LlavaProcessor 6 | 7 | def register_remote_code(): 8 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 9 | AutoConfig.register('internlm2', InternLM2Config, exist_ok=True) 10 | AutoModelForCausalLM.register( 11 | InternLM2Config, InternLM2ForCausalLM, exist_ok=True) 12 | 13 | AutoConfig.register('internlm3', InternLM3Config, exist_ok=True) 14 | AutoModelForCausalLM.register( 15 | InternLM3Config, InternLM3ForCausalLM, exist_ok=True) 16 | AutoTokenizer.register( 17 | InternLM3Config, InternLM3Tokenizer, exist_ok=True) 18 | -------------------------------------------------------------------------------- /xtuner/_lite/modelings/internlm2/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_internlm2 import InternLM2Config 2 | from .modeling_internlm2 import InternLM2ForCausalLM 3 | -------------------------------------------------------------------------------- /xtuner/_lite/modelings/internlm3/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_internlm3 import InternLM3Config 2 | from .modeling_internlm3 import InternLM3ForCausalLM 3 | from .tokenization_internlm3 import InternLM3Tokenizer 4 | -------------------------------------------------------------------------------- /xtuner/_lite/modelings/internvl2/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_intern_vit import InternVisionModel 2 | 3 | __all__ = ['InternVisionModel'] 4 | -------------------------------------------------------------------------------- /xtuner/_lite/modelings/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_llava import EnhancedLlavaConfig 2 | from .modeling_llava import LlavaForConditionalGeneration 3 | from .processing_llava import LlavaProcessor 4 | -------------------------------------------------------------------------------- /xtuner/_lite/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
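A short illustration of the device helpers defined above; on a host with none of CUDA, NPU or MLU, `get_device()` raises `NotImplementedError`, and it is assumed here that the NPU/MLU backend modules mirror `torch.cuda`'s `device_count()`.

```python
from xtuner._lite import get_device
from xtuner._lite.device import get_torch_device_module

device = get_device()                      # "cuda", "npu" or "mlu"
device_module = get_torch_device_module()  # torch.cuda, torch.npu or torch.mlu
print(device, device_module.device_count())
```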
2 | from .comm import all_to_all, all_to_all_list, barrier 3 | from .sampler import LengthGroupedSampler, ParallelSampler, VLMLengthGroupedSampler 4 | from .sequence import * # noqa: F401, F403 5 | from .setup import setup_parallel 6 | 7 | __all__ = [ 8 | "ParallelSampler", 9 | "LengthGroupedSampler", 10 | "VLMLengthGroupedSampler", 11 | "all_to_all", 12 | "all_to_all_list", 13 | "setup_parallel", 14 | "barrier", 15 | ] 16 | -------------------------------------------------------------------------------- /xtuner/_lite/parallel/sequence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.dist import init_dist 3 | 4 | from .attention import ( 5 | post_process_for_sequence_parallel_attn, 6 | pre_process_for_sequence_parallel_attn, 7 | ) 8 | from .ops import ( 9 | gather_for_sequence_parallel, 10 | gather_forward_split_backward, 11 | split_for_sequence_parallel, 12 | split_forward_gather_backward, 13 | ) 14 | 15 | __all__ = [ 16 | "pre_process_for_sequence_parallel_attn", 17 | "post_process_for_sequence_parallel_attn", 18 | "split_for_sequence_parallel", 19 | "init_dist", 20 | "gather_for_sequence_parallel", 21 | "split_forward_gather_backward", 22 | "gather_forward_split_backward", 23 | ] 24 | -------------------------------------------------------------------------------- /xtuner/_lite/parallel/sequence/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from torch.distributed.device_mesh import DeviceMesh 4 | 5 | from ..comm import all_to_all 6 | 7 | 8 | def pre_process_for_sequence_parallel_attn( 9 | query_states: torch.Tensor, 10 | key_states: torch.Tensor, 11 | value_states: torch.Tensor, 12 | sp_mesh: DeviceMesh, 13 | scatter_dim: int = 2, 14 | gather_dim: int = 1, 15 | ): 16 | sp_size = sp_mesh.size() 17 | n_head = query_states.shape[2] 18 | assert n_head % sp_size == 0, ( 19 | "The number of attention heads should be divisible by " 20 | f"sequence_parallel_world_size. But got n_head = {n_head} and " 21 | f"sequence_parallel_world_size = {sp_size}." 22 | ) 23 | 24 | # (b, s // sp_world_size, nd, dim) -> (b, s, nd // sp_world_size, dim) 25 | sp_group = sp_mesh.get_group() 26 | query_states = all_to_all( 27 | query_states, sp_group, scatter_dim=scatter_dim, gather_dim=gather_dim 28 | ) 29 | key_states = all_to_all( 30 | key_states, sp_group, scatter_dim=scatter_dim, gather_dim=gather_dim 31 | ) 32 | value_states = all_to_all( 33 | value_states, sp_group, scatter_dim=scatter_dim, gather_dim=gather_dim 34 | ) 35 | 36 | return query_states, key_states, value_states 37 | 38 | 39 | def post_process_for_sequence_parallel_attn( 40 | attn_output: torch.Tensor, sp_mesh: DeviceMesh, scatter_dim=1, gather_dim=2 41 | ): 42 | # (b, s, nd // sp_world_size, dim) -> (b, s // sp_world_size, nd, dim) 43 | sp_group = sp_mesh.get_group() 44 | output = all_to_all( 45 | attn_output, sp_group, scatter_dim=scatter_dim, gather_dim=gather_dim 46 | ) 47 | return output 48 | -------------------------------------------------------------------------------- /xtuner/_lite/parallel/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
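A shape-level sketch of how the two helpers above bracket an attention call in sequence parallelism; it assumes an already-initialised sequence-parallel `DeviceMesh` and any flash-style attention callable supplied by the caller.

```python
from xtuner._lite.parallel.sequence import (
    post_process_for_sequence_parallel_attn,
    pre_process_for_sequence_parallel_attn,
)


def sp_attention(query, key, value, sp_mesh, local_attn):
    # Each rank holds (b, s // sp_size, n_head, head_dim); the all-to-all trades
    # sequence shards for head shards: (b, s, n_head // sp_size, head_dim).
    query, key, value = pre_process_for_sequence_parallel_attn(query, key, value, sp_mesh)
    out = local_attn(query, key, value)
    # Reverse the exchange so every rank gets back its own sequence shard.
    return post_process_for_sequence_parallel_attn(out, sp_mesh)
```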
2 | import torch 3 | import torch.distributed as dist 4 | from mmengine.dist import infer_launcher, init_dist 5 | from torch._C._distributed_c10d import ReduceOp 6 | from torch.distributed.c10d_logger import _exception_logger 7 | 8 | from xtuner._lite import get_device 9 | 10 | origin_reduce_scatter_tensor = torch.distributed.reduce_scatter_tensor 11 | 12 | 13 | # mlu's reduce_scatter_tensor do not support ReduceOp.AVG, use ReduceOp.SUM / group_world_size instead. 14 | @_exception_logger 15 | def mlu_reduce_scatter_tensor( 16 | output, input, op=ReduceOp.SUM, group=None, async_op=False 17 | ): 18 | if op == ReduceOp.AVG: 19 | result = origin_reduce_scatter_tensor( 20 | output, input, ReduceOp.SUM, group, async_op 21 | ) 22 | output.div_(torch.distributed.get_world_size(group)) 23 | return result 24 | else: 25 | return origin_reduce_scatter_tensor(output, input, op, group, async_op) 26 | 27 | 28 | def setup_parallel(): 29 | if not dist.is_initialized(): 30 | dist_launcher = infer_launcher() 31 | init_dist(dist_launcher) 32 | 33 | device = get_device() 34 | 35 | if device == "mlu": 36 | torch.distributed.reduce_scatter_tensor = mlu_reduce_scatter_tensor 37 | -------------------------------------------------------------------------------- /xtuner/_lite/patches/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .auto import AutoPatch 3 | from .base import FSDPConfig 4 | from .utils import pad_to_max_length, pad_to_multiple_of 5 | 6 | __all__ = ["AutoPatch", "FSDPConfig", "pad_to_max_length", "pad_to_multiple_of"] 7 | -------------------------------------------------------------------------------- /xtuner/_lite/patches/auto.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
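A tiny self-contained illustration (toy values, no process group) of the arithmetic behind the MLU fallback above: the average is recovered from a SUM reduce-scatter by dividing by the group size.

```python
import torch

world_size = 4
per_rank = [torch.full((2,), float(i)) for i in range(world_size)]  # ranks 0..3
summed = torch.stack(per_rank).sum(dim=0)   # what ReduceOp.SUM would produce
averaged = summed / world_size              # equivalent to ReduceOp.AVG
print(averaged)                             # tensor([1.5000, 1.5000])
```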
2 | from transformers.models.llama import LlamaForCausalLM 3 | from transformers.models.qwen2 import Qwen2ForCausalLM 4 | 5 | from xtuner._lite.modelings.internlm3 import InternLM3ForCausalLM 6 | 7 | from .base import FSDPConfig, PatchedCausalLM 8 | from .internlm3 import ( 9 | CUDAPatchedInternLM3ForCausalLM, 10 | MLUPatchedInternLM3ForCausalLM, 11 | MuxiPatchedInternLM3ForCausalLM, 12 | ) 13 | from .llama import ( 14 | CUDAPatchedLlamaForCausalLM, 15 | MLUPatchedLlamaForCausalLM, 16 | MuxiPatchedLlamaForCausalLM, 17 | ) 18 | from .qwen2 import CUDAPatchedQwen2ForCausalLM 19 | 20 | CUDA_PATCH_MAP = { 21 | LlamaForCausalLM: CUDAPatchedLlamaForCausalLM, 22 | InternLM3ForCausalLM: CUDAPatchedInternLM3ForCausalLM, 23 | Qwen2ForCausalLM: CUDAPatchedQwen2ForCausalLM, 24 | } 25 | 26 | MLU_PATCH_MAP = { 27 | LlamaForCausalLM: MLUPatchedLlamaForCausalLM, 28 | InternLM3ForCausalLM: MLUPatchedInternLM3ForCausalLM, 29 | } 30 | 31 | MUXI_PATCH_MAP = { 32 | LlamaForCausalLM: MuxiPatchedLlamaForCausalLM, 33 | InternLM3ForCausalLM: MuxiPatchedInternLM3ForCausalLM, 34 | } 35 | 36 | 37 | class AutoPatch: 38 | @classmethod 39 | def from_causal_lm( 40 | cls, model, fsdp_config: FSDPConfig, device_type="cuda" 41 | ) -> PatchedCausalLM: 42 | if device_type == "cuda": 43 | patch_cls = CUDA_PATCH_MAP[type(model)] 44 | elif device_type == "mlu": 45 | patch_cls = MLU_PATCH_MAP[type(model)] 46 | elif device_type == "muxi": 47 | patch_cls = MUXI_PATCH_MAP[type(model)] 48 | else: 49 | raise NotImplementedError 50 | 51 | patched_model = patch_cls(model, fsdp_config) 52 | 53 | return patched_model 54 | -------------------------------------------------------------------------------- /xtuner/_lite/patches/internlm3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
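A hedged sketch of dispatching through `AutoPatch` above; the checkpoint path is a placeholder, and it assumes `FSDPConfig` accepts `max_length` as a constructor field (only the attribute access is visible in this listing).

```python
from transformers import AutoModelForCausalLM

from xtuner._lite.patches import AutoPatch, FSDPConfig


def patch_for_fsdp(model_path: str, max_length: int = 8192):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    fsdp_config = FSDPConfig(max_length=max_length)  # assumed constructor field
    # Dispatches on type(model): Llama, Qwen2 and InternLM3 are covered on CUDA;
    # an unsupported model class raises KeyError from the lookup table.
    return AutoPatch.from_causal_lm(model, fsdp_config, device_type="cuda")
```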
2 | from xtuner._lite.chat import HybridChatTemplate 3 | from xtuner._lite.modelings.internlm3.modeling_internlm3 import ( 4 | InternLM3Attention, 5 | InternLM3DecoderLayer, 6 | InternLM3ForCausalLM, 7 | InternLM3RotaryEmbedding, 8 | ) 9 | 10 | from .llama import CUDAPatchedLlamaForCausalLM 11 | 12 | 13 | class CUDAPatchedInternLM3ForCausalLM(CUDAPatchedLlamaForCausalLM): 14 | rotary_emb_cls = InternLM3RotaryEmbedding 15 | attn_cls = InternLM3Attention 16 | layer_cls = InternLM3DecoderLayer 17 | causal_cls = InternLM3ForCausalLM 18 | 19 | chat_template = HybridChatTemplate( 20 | system="<|im_start|>system\n{system}<|im_end|>\n", 21 | user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n", 22 | assistant="{assistant}<|im_end|>", 23 | stop_words=["<|im_end|>"], 24 | ) 25 | 26 | def __init__(self, model, fsdp_config=None): 27 | super().__init__(model, fsdp_config) 28 | 29 | if fsdp_config.max_length is not None: 30 | self.patched_model.config.rope_scaling = {"rope_type": "default"} 31 | ori_max_len = self.patched_model.config.max_position_embeddings 32 | self.patched_model.config.max_position_embeddings = max( 33 | fsdp_config.max_length, ori_max_len 34 | ) 35 | self.patched_model.model.rotary_emb = InternLM3RotaryEmbedding( 36 | self.patched_model.config 37 | ).to(self.device_type) 38 | 39 | 40 | class MLUPatchedInternLM3ForCausalLM(CUDAPatchedInternLM3ForCausalLM): 41 | device_type = "mlu" 42 | 43 | 44 | class MuxiPatchedInternLM3ForCausalLM(CUDAPatchedInternLM3ForCausalLM): 45 | device_type = "muxi" 46 | -------------------------------------------------------------------------------- /xtuner/_lite/patches/mixins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .generate import GenerateMixin 3 | 4 | __all__ = ["GenerateMixin"] 5 | -------------------------------------------------------------------------------- /xtuner/_lite/patches/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
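For reference, a runnable example of the plain `ChatTemplate` class from `xtuner._lite.chat`, which defines the same system/user/assistant fields used by the patched model's `chat_template` above.

```python
from xtuner._lite.chat import ChatTemplate

template = ChatTemplate(
    system="<|im_start|>system\n{system}<|im_end|>\n",
    user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n",
    assistant="{assistant}<|im_end|>",
    stop_words=["<|im_end|>"],
)
# Pydantic validators check that each field keeps its {system}/{user}/{assistant} slot.
prompt = template.decorate_system("You are a helpful assistant.") + template.decorate_user("Hi!")
print(prompt)
```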
2 | from typing import List, Union 3 | 4 | import torch 5 | 6 | 7 | def pad_to_multiple_of(sequence, padding_value, multiple_of, dim=-1): 8 | length = sequence.shape[dim] 9 | if length % multiple_of == 0: 10 | return sequence 11 | 12 | pad_num = multiple_of - (length % multiple_of) 13 | pad_shape = ( 14 | (*sequence.shape[:dim], pad_num, *sequence.shape[dim + 1 :]) 15 | if dim != -1 16 | else (*sequence.shape[:dim], pad_num) 17 | ) 18 | pad = torch.full( 19 | pad_shape, padding_value, dtype=sequence.dtype, device=sequence.device 20 | ) 21 | sequence = torch.cat([sequence, pad], dim=dim) 22 | return sequence 23 | 24 | 25 | def pad_to_max_length(sequence, padding_value, max_length, dim=-1): 26 | length = sequence.shape[dim] 27 | assert length <= max_length 28 | pad_num = max_length - length 29 | pad_shape = ( 30 | (*sequence.shape[:dim], pad_num, *sequence.shape[dim + 1 :]) 31 | if dim != -1 32 | else (*sequence.shape[:dim], pad_num) 33 | ) 34 | pad = torch.full( 35 | pad_shape, padding_value, dtype=sequence.dtype, device=sequence.device 36 | ) 37 | sequence = torch.cat([sequence, pad], dim=dim) 38 | return sequence 39 | 40 | 41 | def unpack_sequence(packed: torch.Tensor, num_tokens: Union[torch.Tensor, List], dim=1): 42 | if isinstance(num_tokens, torch.Tensor): 43 | num_tokens = num_tokens.tolist() 44 | sequences = torch.split(packed, num_tokens, dim=dim) 45 | return sequences 46 | 47 | 48 | def pack_sequence(sequences, dim=1): 49 | num_tokens = torch.IntTensor([seq.size(dim) for seq in sequences]) 50 | packed = torch.cat(sequences, dim=dim) 51 | return packed, num_tokens.to(packed.device) 52 | 53 | 54 | def packed_cumulative_length(num_tokens: torch.Tensor): 55 | device = num_tokens.device 56 | _zero_pad = torch.zeros(1, device=device) 57 | _pad_length = torch.cat([_zero_pad, num_tokens]).int() 58 | return torch.cumsum(_pad_length, 0).int() 59 | -------------------------------------------------------------------------------- /xtuner/apis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .datasets import * # noqa: F401, F403 3 | from .model import * # noqa: F401, F403 4 | from .training_args import * # noqa: F401, F403 5 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
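A runnable toy demonstration of the padding and packing helpers defined in `patches/utils.py` above.

```python
import torch

from xtuner._lite.patches.utils import (
    pack_sequence,
    pad_to_max_length,
    pad_to_multiple_of,
    unpack_sequence,
)

a = torch.ones(1, 5, dtype=torch.long)
b = torch.full((1, 3), 2, dtype=torch.long)

print(pad_to_multiple_of(a, padding_value=0, multiple_of=4).shape)  # (1, 8)
print(pad_to_max_length(b, padding_value=0, max_length=10).shape)   # (1, 10)

packed, num_tokens = pack_sequence([a, b], dim=1)  # packed: (1, 8), num_tokens: [5, 3]
print([t.shape for t in unpack_sequence(packed, num_tokens, dim=1)])  # (1, 5) and (1, 3)
```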
2 | from .alpaca import ( 3 | alpaca_data_collator, 4 | alpaca_dataset, 5 | alpaca_enzh_data_collator, 6 | alpaca_enzh_dataset, 7 | alpaca_zh_data_collator, 8 | alpaca_zh_dataset, 9 | ) 10 | from .arxiv import arxiv_data_collator, arxiv_dataset 11 | from .code_alpaca import code_alpaca_data_collator, code_alpaca_dataset 12 | from .colorist import colorist_data_collator, colorist_dataset 13 | from .lawyer import ( 14 | lawyer_crime_data_collator, 15 | lawyer_crime_dataset, 16 | lawyer_data_collator, 17 | lawyer_dataset, 18 | lawyer_reference_data_collator, 19 | lawyer_reference_dataset, 20 | ) 21 | from .medical import medical_data_collator, medical_dataset 22 | from .moss_003_sft import ( 23 | moss_003_sft_data_collator, 24 | moss_003_sft_dataset, 25 | moss_003_sft_no_plugins_data_collator, 26 | moss_003_sft_no_plugins_dataset, 27 | moss_003_sft_plugins_data_collator, 28 | moss_003_sft_plugins_dataset, 29 | ) 30 | from .oasst1 import oasst1_data_collator, oasst1_dataset 31 | from .open_orca import openorca_data_collator, openorca_dataset 32 | from .sql import sql_data_collator, sql_dataset 33 | from .tiny_codes import tiny_codes_data_collator, tiny_codes_dataset 34 | from .wizardlm import wizardlm_data_collator, wizardlm_dataset 35 | 36 | __all__ = [ 37 | "alpaca_data_collator", 38 | "alpaca_dataset", 39 | "alpaca_enzh_data_collator", 40 | "alpaca_enzh_dataset", 41 | "alpaca_zh_data_collator", 42 | "alpaca_zh_dataset", 43 | "arxiv_data_collator", 44 | "arxiv_dataset", 45 | "medical_data_collator", 46 | "medical_dataset", 47 | "moss_003_sft_data_collator", 48 | "moss_003_sft_dataset", 49 | "moss_003_sft_no_plugins_data_collator", 50 | "moss_003_sft_no_plugins_dataset", 51 | "moss_003_sft_plugins_data_collator", 52 | "moss_003_sft_plugins_dataset", 53 | "oasst1_data_collator", 54 | "oasst1_dataset", 55 | "openorca_data_collator", 56 | "openorca_dataset", 57 | "lawyer_crime_dataset", 58 | "lawyer_crime_data_collator", 59 | "lawyer_reference_dataset", 60 | "lawyer_reference_data_collator", 61 | "lawyer_dataset", 62 | "lawyer_data_collator", 63 | "colorist_dataset", 64 | "colorist_data_collator", 65 | "sql_dataset", 66 | "sql_data_collator", 67 | "code_alpaca_dataset", 68 | "code_alpaca_data_collator", 69 | "tiny_codes_dataset", 70 | "tiny_codes_data_collator", 71 | "wizardlm_data_collator", 72 | "wizardlm_dataset", 73 | ] 74 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/arxiv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def arxiv_dataset( 13 | tokenizer, 14 | data_file=None, 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv # noqa: E501 22 | # 2. 
Process data with `./tools/data_preprocess/arxiv.py` 23 | if data_file is None: 24 | data_file = "./data/arxiv_postprocess_csAIcsCLcsCV_20200101.json" 25 | dataset_org = load_dataset(path="json", data_files=dict(train=data_file)) 26 | dataset = process_hf_dataset( 27 | dataset=dataset_org, 28 | tokenizer=tokenizer, 29 | max_length=max_length, 30 | dataset_map_fn=arxiv_map_fn, 31 | template_map_fn=template_map_fn, 32 | remove_unused_columns=remove_unused_columns, 33 | shuffle_before_pack=True, 34 | pack_to_max_length=pack_to_max_length, 35 | ) 36 | 37 | return dataset 38 | 39 | 40 | def arxiv_data_collator(return_hf_format=False): 41 | return partial(default_collate_fn, return_hf_format=return_hf_format) 42 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/code_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def code_alpaca_dataset( 13 | tokenizer, 14 | path="HuggingFaceH4/CodeAlpaca_20K", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=code_alpaca_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def code_alpaca_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/colorist.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
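A hedged sketch of pairing one of the dataset factories above with its collator; the tokenizer checkpoint is a placeholder, and fetching the dataset requires network access.

```python
from transformers import AutoTokenizer

from xtuner.apis.datasets import code_alpaca_data_collator, code_alpaca_dataset


def build_code_alpaca(tokenizer_path: str, max_length: int = 2048):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    train_dataset = code_alpaca_dataset(tokenizer, max_length=max_length)
    collate_fn = code_alpaca_data_collator(return_hf_format=False)
    return train_dataset, collate_fn
```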
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def colorist_dataset( 13 | tokenizer, 14 | path="burkelibbey/colors", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=colors_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def colorist_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/medical.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import medical_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def medical_dataset( 13 | tokenizer, 14 | path="shibing624/medical", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=False, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=medical_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def medical_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/moss_003_sft.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from torch.utils.data import ConcatDataset 5 | 6 | from xtuner.dataset import MOSSSFTDataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | 9 | 10 | def moss_003_sft_dataset( 11 | tokenizer, 12 | plugins_data_file=None, 13 | no_plugins_data_file=None, 14 | bot_name=None, 15 | max_length=2048, 16 | ): 17 | plugins = moss_003_sft_plugins_dataset( 18 | tokenizer, data_file=plugins_data_file, bot_name=bot_name, max_length=max_length 19 | ) 20 | no_plugins = moss_003_sft_no_plugins_dataset( 21 | tokenizer, 22 | data_file=no_plugins_data_file, 23 | bot_name=bot_name, 24 | max_length=max_length, 25 | ) 26 | dataset = ConcatDataset([plugins, no_plugins]) 27 | return dataset 28 | 29 | 30 | def moss_003_sft_data_collator(return_hf_format=False): 31 | return partial(default_collate_fn, return_hf_format=return_hf_format) 32 | 33 | 34 | def moss_003_sft_no_plugins_dataset( 35 | tokenizer, data_file=None, bot_name=None, max_length=2048 36 | ): 37 | # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data 38 | if data_file is None: 39 | data_file = "./data/moss-003-sft-no-tools.jsonl" 40 | dataset = MOSSSFTDataset( 41 | data_file=data_file, 42 | bot_name=bot_name, 43 | tokenizer=tokenizer, 44 | max_length=max_length, 45 | ) 46 | 47 | return dataset 48 | 49 | 50 | def moss_003_sft_no_plugins_data_collator(return_hf_format=False): 51 | return partial(default_collate_fn, return_hf_format=return_hf_format) 52 | 53 | 54 | def moss_003_sft_plugins_dataset( 55 | tokenizer, data_file=None, bot_name=None, max_length=2048 56 | ): 57 | # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data 58 | if data_file is None: 59 | data_file = "./data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl" # noqa: E501 60 | dataset = MOSSSFTDataset( 61 | data_file=data_file, 62 | bot_name=bot_name, 63 | tokenizer=tokenizer, 64 | max_length=max_length, 65 | ) 66 | 67 | return dataset 68 | 69 | 70 | def moss_003_sft_plugins_data_collator(return_hf_format=False): 71 | return partial(default_collate_fn, return_hf_format=return_hf_format) 72 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/oasst1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def oasst1_dataset( 13 | tokenizer, 14 | path="timdettmers/openassistant-guanaco", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=False, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=oasst1_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def oasst1_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/open_orca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import openorca_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def openorca_dataset( 13 | tokenizer, 14 | path="Open-Orca/OpenOrca", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=openorca_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def openorca_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/sql.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def sql_dataset( 13 | tokenizer, 14 | path="b-mc2/sql-create-context", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=sql_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def sql_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/tiny_codes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import template_map_fn_factory, tiny_codes_map_fn 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def tiny_codes_dataset( 13 | tokenizer, 14 | path="nampdn-ai/tiny-codes", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=True, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=tiny_codes_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def tiny_codes_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/datasets/wizardlm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import template_map_fn_factory, wizardlm_map_fn 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def wizardlm_dataset( 13 | tokenizer, 14 | path="WizardLM/WizardLM_evol_instruct_V2_196k", 15 | max_length=2048, 16 | prompt_template=PROMPT_TEMPLATE.default, 17 | remove_unused_columns=False, 18 | pack_to_max_length=True, 19 | ): 20 | template_map_fn = template_map_fn_factory(template=prompt_template) 21 | dataset_org = load_dataset(path) 22 | dataset = process_hf_dataset( 23 | dataset=dataset_org, 24 | tokenizer=tokenizer, 25 | max_length=max_length, 26 | dataset_map_fn=wizardlm_map_fn, 27 | template_map_fn=template_map_fn, 28 | remove_unused_columns=remove_unused_columns, 29 | shuffle_before_pack=True, 30 | pack_to_max_length=pack_to_max_length, 31 | ) 32 | 33 | return dataset 34 | 35 | 36 | def wizardlm_data_collator(return_hf_format=False): 37 | return partial(default_collate_fn, return_hf_format=return_hf_format) 38 | -------------------------------------------------------------------------------- /xtuner/apis/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from peft import LoraConfig 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 5 | 6 | from xtuner.model import SupervisedFinetune 7 | 8 | __all__ = ["build_model", "build_lora_model", "build_qlora_model"] 9 | 10 | 11 | def build_qlora_model( 12 | model_name_or_path, 13 | quantization_config=None, 14 | lora_config=None, 15 | return_tokenizer=True, 16 | ): 17 | if quantization_config is None: 18 | quantization_config = BitsAndBytesConfig( 19 | load_in_4bit=True, 20 | load_in_8bit=False, 21 | llm_int8_threshold=6.0, 22 | llm_int8_has_fp16_weight=False, 23 | bnb_4bit_compute_dtype=torch.float16, 24 | bnb_4bit_use_double_quant=True, 25 | bnb_4bit_quant_type="nf4", 26 | ) 27 | if lora_config is None: 28 | lora_config = LoraConfig( 29 | r=64, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 30 | ) 31 | 32 | llm = AutoModelForCausalLM.from_pretrained( 33 | model_name_or_path, 34 | torch_dtype=torch.float16, 35 | trust_remote_code=True, 36 | quantization_config=quantization_config, 37 | ) 38 | 39 | model = SupervisedFinetune(llm, lora=lora_config) 40 | 41 | if return_tokenizer: 42 | tokenizer = AutoTokenizer.from_pretrained( 43 | model_name_or_path, trust_remote_code=True, encode_special_tokens=True 44 | ) 45 | return model.llm, tokenizer 46 | else: 47 | return model.llm 48 | 49 | 50 | def build_lora_model(model_name_or_path, lora_config=None, return_tokenizer=True): 51 | if lora_config is None: 52 | lora_config = LoraConfig( 53 | r=64, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 54 | ) 55 | 56 | llm = AutoModelForCausalLM.from_pretrained( 57 | model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True 58 | ) 59 | 60 | model = SupervisedFinetune(llm, lora=lora_config) 61 | 62 | if return_tokenizer: 63 | tokenizer = AutoTokenizer.from_pretrained( 64 | model_name_or_path, trust_remote_code=True, encode_special_tokens=True 65 | ) 66 | return model.llm, tokenizer 67 | else: 68 | return model.llm 69 | 70 | 71 | def build_model(model_name_or_path, return_tokenizer=True): 72 | model = 
AutoModelForCausalLM.from_pretrained( 73 | model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True 74 | ) 75 | 76 | if return_tokenizer: 77 | tokenizer = AutoTokenizer.from_pretrained( 78 | model_name_or_path, trust_remote_code=True, encode_special_tokens=True 79 | ) 80 | return model, tokenizer 81 | else: 82 | return model 83 | -------------------------------------------------------------------------------- /xtuner/apis/training_args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from dataclasses import dataclass, field 3 | from typing import Union 4 | 5 | from transformers import TrainingArguments 6 | from transformers.trainer_utils import IntervalStrategy, SchedulerType 7 | 8 | __all__ = ["DefaultTrainingArguments"] 9 | 10 | 11 | @dataclass 12 | class DefaultTrainingArguments(TrainingArguments): 13 | # custom 14 | model_name_or_path: str = field( 15 | default=None, 16 | metadata={"help": "model name or path."}, 17 | ) 18 | dataset_name_or_path: str = field( 19 | default=None, 20 | metadata={"help": "dataset name or path."}, 21 | ) 22 | 23 | # huggingface 24 | default_output_dir = "./work_dirs" 25 | default_do_train = True 26 | default_per_device_train_batch_size = 1 27 | default_learning_rate = 2e-5 28 | default_save_strategy = "epoch" 29 | default_lr_scheduler_type = "cosine" 30 | default_logging_steps = 5 31 | 32 | output_dir: str = field( 33 | default=default_output_dir, 34 | metadata={ 35 | "help": ( 36 | "The output directory where the model predictions and " 37 | "checkpoints will be written." 38 | ) 39 | }, 40 | ) 41 | do_train: bool = field( 42 | default=default_do_train, metadata={"help": "Whether to run training."} 43 | ) 44 | per_device_train_batch_size: int = field( 45 | default=default_per_device_train_batch_size, 46 | metadata={"help": "Batch size per GPU/TPU core/CPU for training."}, 47 | ) 48 | learning_rate: float = field( 49 | default=default_learning_rate, 50 | metadata={"help": "The initial learning rate for AdamW."}, 51 | ) 52 | save_strategy: Union[IntervalStrategy, str] = field( 53 | default=default_save_strategy, 54 | metadata={"help": "The checkpoint save strategy to use."}, 55 | ) 56 | lr_scheduler_type: Union[SchedulerType, str] = field( 57 | default=default_lr_scheduler_type, 58 | metadata={"help": "The scheduler type to use."}, 59 | ) 60 | logging_steps: float = field( 61 | default=default_logging_steps, 62 | metadata={ 63 | "help": ( 64 | "Log every X updates steps. Should be an integer or a " 65 | "float in range `[0,1)`. If smaller than 1, will be " 66 | "interpreted as ratio of total training steps." 67 | ) 68 | }, 69 | ) 70 | -------------------------------------------------------------------------------- /xtuner/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
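A hedged end-to-end sketch connecting the model builders and `DefaultTrainingArguments` above to a plain Hugging Face `Trainer`, in the spirit of `examples/huggingface_trainer`; the model name is a placeholder.

```python
from transformers import Trainer

from xtuner.apis import DefaultTrainingArguments, build_qlora_model
from xtuner.apis.datasets import oasst1_data_collator, oasst1_dataset


def build_trainer(model_name_or_path: str):
    model, tokenizer = build_qlora_model(model_name_or_path, return_tokenizer=True)
    args = DefaultTrainingArguments(model_name_or_path=model_name_or_path)
    train_dataset = oasst1_dataset(tokenizer, max_length=2048)
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        data_collator=oasst1_data_collator(),
    )
```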
2 | import os 3 | 4 | 5 | def get_cfgs_name_path(): 6 | path = os.path.dirname(__file__) 7 | mapping = {} 8 | for root, dirs, files in os.walk(path): 9 | for file_ in files: 10 | if ( 11 | file_.endswith((".py", ".json")) 12 | and not file_.startswith(".") 13 | and not file_.startswith("_") 14 | ): 15 | mapping[os.path.splitext(file_)[0]] = os.path.join(root, file_) 16 | return mapping 17 | 18 | 19 | cfgs_name_path = get_cfgs_name_path() 20 | 21 | __all__ = ["cfgs_name_path"] 22 | -------------------------------------------------------------------------------- /xtuner/configs/cohere/README.md: -------------------------------------------------------------------------------- 1 | # Cohere 100B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # Cohere requires the latest version of transformers. 10 | pip install git+https://github.com/huggingface/transformers.git 11 | 12 | # Sequence parallel requires flash-attn 13 | pip install flash-attn 14 | ``` 15 | 16 | ## Full Parameter Fine-tune 17 | 18 | Full parameter fine-tuning needs 64 A100-80G GPUs 19 | 20 | ### slurm 21 | 22 | Note: `$PARTITION` means the virtual partition of slurm. 23 | 24 | ```bash 25 | srun -p $PARTITION --job-name=Cohere --nodes=8 --gres=gpu:8 --ntasks-per-node=8 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 --launcher slurm 26 | ``` 27 | 28 | ### torchrun 29 | 30 | Note: `$NODE_0_ADDR` means the ip address of the node_0 machine. 31 | 32 | ```bash 33 | # execute on node 0 34 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 35 | 36 | # execute on node 1 37 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 38 | ``` 39 | 40 | ### Speed 41 | 42 | 16 * A100 80G: 43 | 44 | | Model | Sequence Length | GPUs Number | Sequence Parallel World Size | Tokens per Second | TFLOPs | 45 | | :---------: | :-------------: | :---------: | :--------------------------: | :---------------: | :----: | 46 | | Cohere_100b | 128k | 64 | 32 | 97.3 | 173.4 | 47 | | Cohere_100b | 128k | 128 | 16 | 102.1 | 182.7 | 48 | | Cohere_100b | 128k | 256 | 16 | 101.3 | 181.3 | 49 | -------------------------------------------------------------------------------- /xtuner/configs/deepseek/README.md: -------------------------------------------------------------------------------- 1 | # DeepSeek V2 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Git clone the latest xtuner 7 | git clone https://github.com/InternLM/xtuner.git 8 | 9 | # Install the latest xtuner 10 | cd xtuner 11 | pip install -e '.[all]' 12 | 13 | # DeepSeek V2 requires flash-attn 14 | pip install flash-attn 15 | 16 | # Install the latest transformers 17 | pip install -U transformers 18 | ``` 19 | 20 | ## Full Parameter Fine-tune 21 | 22 | Full parameter fine-tuning of DeepSeek V2 236B needs at least 64 A100-80G GPUs. The full-tuned model will be saved to `${WORK_DIRS}/hf_model` by `HFCheckpointHook`. 23 | 24 | ### slurm 25 | 26 | Note: `$PARTITION` means the virtual partition of slurm. 27 | 28 | ```bash 29 | srun -p $PARTITION --job-name=deepseek --nodes=8 --gres=gpu:8 --ntasks-per-node=8 xtuner train deepseek_v2_chat_full_alpaca_e3 --deepspeed deepspeed_zero3 --launcher slurm 30 | ``` 31 | 32 | ### torchrun 33 | 34 | Note: `$NODE_0_ADDR` means the ip address of the node_0 machine.
35 | 36 | ```bash 37 | # execute on node 0 38 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train deepseek_v2_chat_full_alpaca_e3 --deepspeed deepspeed_zero3 --launcher pytorch 39 | 40 | # execute on node 1 41 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train deepseek_v2_chat_full_alpaca_e3 --deepspeed deepspeed_zero3 --launcher pytorch 42 | 43 | # execute on node 2, 3, ..., 7 44 | ``` 45 | 46 | ### Speed 47 | 48 | 128 * A100 80G: 49 | 50 | | Model | Sequence Length | Use Varlen Attn | Sequence Parallel World Size | Tokens per Second | 51 | | :--------------------: | :-------------: | :-------------: | :--------------------------: | :---------------: | 52 | | deepseek v2 hf | 8k | False | 1 | 60 | 53 | | **deepseek v2 XTuner** | **8k** | **False** | **1** | **120 (2x)** | 54 | | deepseek v2 hf | 8k | True | 1 | 60 | 55 | | **deepseek v2 XTuner** | **8k** | **True** | **1** | **130 (2.2x)** | 56 | | deepseek v2 hf | 16k | False | 1 | OOM | 57 | | **deepseek v2 XTuner** | **16k** | **False** | **1** | **148** | 58 | | deepseek v2 hf | 16k | True | 1 | 95 | 59 | | **deepseek v2 XTuner** | **16k** | **True** | **1** | **180 (1.9x)** | 60 | -------------------------------------------------------------------------------- /xtuner/configs/deepspeed/deepspeed_zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 1, 9 | "overlap_comm": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "initial_scale_power": 16 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /xtuner/configs/deepspeed/deepspeed_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 2, 9 | "overlap_comm": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "initial_scale_power": 16 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /xtuner/configs/deepspeed/deepspeed_zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 2, 9 | "overlap_comm": true, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | } 14 | }, 15 | "fp16": { 16 | "enabled": "auto", 17 | "initial_scale_power": 16 18 | }, 19 | "bf16": { 20 | "enabled": "auto" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /xtuner/configs/deepspeed/deepspeed_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 |
"zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 3, 9 | "overlap_comm": true, 10 | "stage3_gather_16bit_weights_on_model_save": true 11 | }, 12 | "fp16": { 13 | "enabled": "auto", 14 | "initial_scale_power": 16 15 | }, 16 | "bf16": { 17 | "enabled": "auto" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /xtuner/configs/deepspeed/deepspeed_zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 3, 9 | "overlap_comm": true, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "offload_param": { 15 | "device": "cpu", 16 | "pin_memory": true 17 | }, 18 | "stage3_gather_16bit_weights_on_model_save": true 19 | }, 20 | "fp16": { 21 | "enabled": "auto", 22 | "initial_scale_power": 16 23 | }, 24 | "bf16": { 25 | "enabled": "auto" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_oasst1_e3_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from datasets import load_dataset 4 | from peft import LoraConfig 5 | from transformers import ( 6 | AutoModelForCausalLM, 7 | AutoTokenizer, 8 | BitsAndBytesConfig, 9 | Trainer, 10 | TrainingArguments, 11 | ) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory 15 | from xtuner.utils import PROMPT_TEMPLATE 16 | 17 | framework = "huggingface" 18 | pretrained_model_name_or_path = "internlm/internlm-7b" 19 | dataset_name_or_path = "timdettmers/openassistant-guanaco" 20 | max_length = 2048 21 | pack_to_max_length = True 22 | prompt_template = PROMPT_TEMPLATE.default 23 | 24 | trainer = Trainer 25 | 26 | training_args = dict( 27 | type=TrainingArguments, 28 | do_train=True, 29 | learning_rate=2e-4, 30 | weight_decay=0, 31 | lr_scheduler_type="cosine", 32 | warmup_steps=100, 33 | per_device_train_batch_size=1, 34 | gradient_accumulation_steps=16, 35 | num_train_epochs=3, 36 | fp16=True, 37 | logging_steps=1, 38 | optim="paged_adamw_32bit", 39 | save_strategy="steps", 40 | save_steps=1000, 41 | save_total_limit=2, 42 | ddp_find_unused_parameters=False, 43 | ) 44 | 45 | tokenizer = dict( 46 | type=AutoTokenizer.from_pretrained, 47 | pretrained_model_name_or_path=pretrained_model_name_or_path, 48 | trust_remote_code=True, 49 | padding_side="right", 50 | ) 51 | 52 | model = dict( 53 | type=AutoModelForCausalLM.from_pretrained, 54 | pretrained_model_name_or_path=pretrained_model_name_or_path, 55 | trust_remote_code=True, 56 | torch_dtype=torch.float16, 57 | quantization_config=dict( 58 | type=BitsAndBytesConfig, 59 | load_in_4bit=True, 60 | load_in_8bit=False, 61 | llm_int8_threshold=6.0, 62 | llm_int8_has_fp16_weight=False, 63 | bnb_4bit_compute_dtype=torch.float16, 64 | bnb_4bit_use_double_quant=True, 65 | bnb_4bit_quant_type="nf4", 66 | ), 67 | ) 68 | 69 | lora = dict( 70 | type=LoraConfig, 71 | r=64, 72 | lora_alpha=16, 73 | lora_dropout=0.1, 74 | bias="none", 75 | task_type="CAUSAL_LM", 76 | ) 77 | 78 | train_dataset = dict( 79 | 
type=process_hf_dataset, 80 | dataset=dict(type=load_dataset, path=dataset_name_or_path), 81 | tokenizer=tokenizer, 82 | max_length=max_length, 83 | dataset_map_fn=oasst1_map_fn, 84 | template_map_fn=dict(type=template_map_fn_factory, template=prompt_template), 85 | remove_unused_columns=True, 86 | shuffle_before_pack=True, 87 | pack_to_max_length=pack_to_max_length, 88 | ) 89 | -------------------------------------------------------------------------------- /xtuner/configs/internvl/v1_5/convert_to_official.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os.path as osp 4 | 5 | import torch 6 | from mmengine.config import Config 7 | from transformers import AutoTokenizer 8 | 9 | from xtuner.model.utils import LoadWoInit 10 | from xtuner.registry import BUILDER 11 | 12 | 13 | def convert_to_official(config, trained_path, save_path): 14 | cfg = Config.fromfile(config) 15 | cfg.model.pretrained_pth = trained_path 16 | cfg.model.quantization_vit = False 17 | cfg.model.quantization_llm = False 18 | 19 | with LoadWoInit(): 20 | model = BUILDER.build(cfg.model) 21 | model.to(torch.bfloat16) 22 | 23 | if model.use_visual_encoder_lora: 24 | vision_model = model.model.vision_model.merge_and_unload() 25 | model.model.vision_model = vision_model 26 | 27 | if model.use_llm_lora: 28 | language_model = model.model.language_model.merge_and_unload() 29 | model.model.language_model = language_model 30 | 31 | model.model.save_pretrained(save_path) 32 | 33 | tokenizer = AutoTokenizer.from_pretrained( 34 | cfg.model.model_path, trust_remote_code=True 35 | ) 36 | tokenizer.save_pretrained(save_path) 37 | 38 | print(model) 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser( 43 | description="Convert the pth model to HuggingFace model" 44 | ) 45 | parser.add_argument("config", help="config file name or path.") 46 | parser.add_argument("trained_model_pth", help="The trained model path.") 47 | parser.add_argument("save_path", help="The path to save the converted model.") 48 | args = parser.parse_args() 49 | 50 | if osp.realpath(args.trained_model_pth) == osp.realpath(args.save_path): 51 | raise ValueError("The trained path and save path should not be the same.") 52 | 53 | convert_to_official(args.config, args.trained_model_pth, args.save_path) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /xtuner/configs/llama/llama2_70b/llama2_70b_int8_lora_open_platypus_e1_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | from datasets import load_dataset 4 | from peft import LoraConfig 5 | from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments 6 | 7 | from xtuner.dataset import process_hf_dataset 8 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | framework = "huggingface" 12 | pretrained_model_name_or_path = "meta-llama/Llama-2-70b-hf" 13 | dataset_name_or_path = "garage-bAInd/Open-Platypus" 14 | max_length = 2048 15 | pack_to_max_length = True 16 | prompt_template = PROMPT_TEMPLATE.llama2_chat 17 | 18 | trainer = Trainer 19 | 20 | training_args = dict( 21 | type=TrainingArguments, 22 | do_train=True, 23 | learning_rate=3e-4, 24 | weight_decay=0, 25 | lr_scheduler_type="cosine", 26 | warmup_steps=100, 27 | per_device_train_batch_size=1, 28 | gradient_accumulation_steps=16, 29 | num_train_epochs=1, 30 | fp16=True, 31 | logging_steps=1, 32 | optim="adamw_torch", 33 | save_strategy="steps", 34 | save_steps=1000, 35 | save_total_limit=2, 36 | ddp_find_unused_parameters=False, 37 | ) 38 | 39 | tokenizer = dict( 40 | type=AutoTokenizer.from_pretrained, 41 | pretrained_model_name_or_path=pretrained_model_name_or_path, 42 | trust_remote_code=True, 43 | padding_side="right", 44 | ) 45 | 46 | model = dict( 47 | type=AutoModelForCausalLM.from_pretrained, 48 | pretrained_model_name_or_path=pretrained_model_name_or_path, 49 | trust_remote_code=True, 50 | torch_dtype=torch.float16, 51 | load_in_8bit=True, 52 | ) 53 | 54 | lora = dict( 55 | type=LoraConfig, 56 | r=16, 57 | lora_alpha=16, 58 | lora_dropout=0.05, 59 | target_modules=["gate_proj", "down_proj", "up_proj"], 60 | bias="none", 61 | task_type="CAUSAL_LM", 62 | ) 63 | 64 | train_dataset = dict( 65 | type=process_hf_dataset, 66 | dataset=dict(type=load_dataset, path=dataset_name_or_path), 67 | tokenizer=tokenizer, 68 | max_length=max_length, 69 | dataset_map_fn=alpaca_map_fn, 70 | template_map_fn=dict(type=template_map_fn_factory, template=prompt_template), 71 | remove_unused_columns=True, 72 | shuffle_before_pack=True, 73 | pack_to_max_length=pack_to_max_length, 74 | ) 75 | -------------------------------------------------------------------------------- /xtuner/configs/llama/llama2_70b/llama2_70b_qlora_open_platypus_e1_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | from datasets import load_dataset 4 | from peft import LoraConfig 5 | from transformers import ( 6 | AutoModelForCausalLM, 7 | AutoTokenizer, 8 | BitsAndBytesConfig, 9 | Trainer, 10 | TrainingArguments, 11 | ) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 15 | from xtuner.utils import PROMPT_TEMPLATE 16 | 17 | framework = "huggingface" 18 | pretrained_model_name_or_path = "meta-llama/Llama-2-70b-hf" 19 | dataset_name_or_path = "garage-bAInd/Open-Platypus" 20 | max_length = 2048 21 | pack_to_max_length = True 22 | prompt_template = PROMPT_TEMPLATE.llama2_chat 23 | 24 | trainer = Trainer 25 | 26 | training_args = dict( 27 | type=TrainingArguments, 28 | do_train=True, 29 | learning_rate=3e-4, 30 | weight_decay=0, 31 | lr_scheduler_type="cosine", 32 | warmup_steps=100, 33 | per_device_train_batch_size=1, 34 | gradient_accumulation_steps=16, 35 | num_train_epochs=1, 36 | fp16=True, 37 | logging_steps=1, 38 | optim="adamw_torch", 39 | save_strategy="steps", 40 | save_steps=1000, 41 | save_total_limit=2, 42 | ddp_find_unused_parameters=False, 43 | ) 44 | 45 | tokenizer = dict( 46 | type=AutoTokenizer.from_pretrained, 47 | pretrained_model_name_or_path=pretrained_model_name_or_path, 48 | trust_remote_code=True, 49 | padding_side="right", 50 | ) 51 | 52 | model = dict( 53 | type=AutoModelForCausalLM.from_pretrained, 54 | pretrained_model_name_or_path=pretrained_model_name_or_path, 55 | trust_remote_code=True, 56 | torch_dtype=torch.float16, 57 | quantization_config=dict( 58 | type=BitsAndBytesConfig, 59 | load_in_4bit=True, 60 | load_in_8bit=False, 61 | llm_int8_threshold=6.0, 62 | llm_int8_has_fp16_weight=False, 63 | bnb_4bit_compute_dtype=torch.float16, 64 | bnb_4bit_use_double_quant=True, 65 | bnb_4bit_quant_type="nf4", 66 | ), 67 | ) 68 | 69 | lora = dict( 70 | type=LoraConfig, 71 | r=64, 72 | lora_alpha=16, 73 | lora_dropout=0.1, 74 | target_modules=["gate_proj", "down_proj", "up_proj"], 75 | bias="none", 76 | task_type="CAUSAL_LM", 77 | ) 78 | 79 | train_dataset = dict( 80 | type=process_hf_dataset, 81 | dataset=dict(type=load_dataset, path=dataset_name_or_path), 82 | tokenizer=tokenizer, 83 | max_length=max_length, 84 | dataset_map_fn=alpaca_map_fn, 85 | template_map_fn=dict(type=template_map_fn_factory, template=prompt_template), 86 | remove_unused_columns=True, 87 | shuffle_before_pack=True, 88 | pack_to_max_length=pack_to_max_length, 89 | ) 90 | -------------------------------------------------------------------------------- /xtuner/configs/llama/llama3_8b/README.md: -------------------------------------------------------------------------------- 1 | # Llama3 8B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # install the latest transformers 10 | pip install -U transformers 11 | ``` 12 | 13 | ## QLoRA Fine-tune 14 | 15 | QLoRA only need a single A100-80G 16 | 17 | ```bash 18 | xtuner train llama3_8b_instruct_qlora_alpaca_e3 19 | ``` 20 | 21 | ## Full Parameter Fine-tune 22 | 23 | Full parameter fine-tune Llama3 8B in 8k context only requires 2 * A100-80G 24 | 25 | ### torchrun 26 | 27 | ```bash 28 | NPROC_PER_NODE=${GPU_NUM} xtuner train llama3_8b_instruct_full_alpaca_e3 --deepspeed deepspeed_zero2 29 | ``` 30 | 31 | ### slurm 32 | 33 | ```bash 34 | srun ${SRUN_ARGS} xtuner train llama3_8b_instruct_full_alpaca_e3 --launcher slurm --deepspeed deepspeed_zero3 35 | ``` 36 | 37 | ### Speed 38 | 39 | | Model | Sequence 
Length | GPU Number | ZeRO | Sequence Parallel | Tokens per Second | TFLOPs | 40 | | :-------: | :-------------: | :--------: | :----: | :---------------: | :---------------: | :----: | 41 | | Llama3 8B | 8k | 2 | ZeRO-3 | 2 | 1037.0 | 76.8 | 42 | | Llama3 8B | 8k | 4 | ZeRO-3 | 1 | 2331.3 | 172.6 | 43 | | Llama3 8B | 8k | 8 | ZeRO-3 | 1 | 2771.2 | 205.1 | 44 | 45 | | Model | Sequence Length | GPU Number | ZeRO | Sequence Parallel | Tokens per Second | TFLOPs | 46 | | :-------: | :-------------: | :--------: | :----: | :---------------: | :---------------: | :----: | 47 | | Llama3 8B | 8k | 8 | ZeRO-3 | 1 | 2771.2 | 205.1 | 48 | | Llama3 8B | 16k | 8 | ZeRO-3 | 2 | 2320.7 | 191.7 | 49 | | Llama3 8B | 32k | 8 | ZeRO-3 | 4 | 1870.2 | 186.6 | 50 | | Llama3 8B | 64k | 8 | ZeRO-3 | 8 | 1356.4 | 182.0 | 51 | | Llama3 8B | 128k | 8 | ZeRO-3 | 8 | 875.7 | 177.7 | 52 | -------------------------------------------------------------------------------- /xtuner/configs/mixtral/README.md: -------------------------------------------------------------------------------- 1 | # Mixtral 8x7B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # Mixtral requires flash-attn 10 | pip install flash-attn 11 | 12 | # install the latest transformers 13 | pip install -U transformers 14 | ``` 15 | 16 | ## QLoRA Fine-tune 17 | 18 | QLoRA only needs a single A100-80G 19 | 20 | ```bash 21 | xtuner train mixtral_8x7b_instruct_qlora_oasst1_e3 --deepspeed deepspeed_zero2 22 | ``` 23 | 24 | ## Full Parameter Fine-tune 25 | 26 | Full parameter fine-tuning needs 16 A100-80G GPUs 27 | 28 | ### slurm 29 | 30 | Note: `$PARTITION` is the Slurm partition to submit the job to. 31 | 32 | ```bash 33 | srun -p $PARTITION --job-name=mixtral --nodes=2 --gres=gpu:8 --ntasks-per-node=8 xtuner train mixtral_8x7b_instruct_full_oasst1_e3 --deepspeed deepspeed_zero3 --launcher slurm 34 | ``` 35 | 36 | ### torchrun 37 | 38 | Note: `$NODE_0_ADDR` is the IP address of the node 0 machine. 39 | 40 | ```bash 41 | # execute on node 0 42 | NPROC_PER_NODE=8 NNODES=2 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train mixtral_8x7b_instruct_full_oasst1_e3 --deepspeed deepspeed_zero3 43 | 44 | # execute on node 1 45 | NPROC_PER_NODE=8 NNODES=2 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train mixtral_8x7b_instruct_full_oasst1_e3 --deepspeed deepspeed_zero3 46 | ``` 47 | 48 | ### Speed 49 | 50 | 16 * A100 80G: 51 | 52 | | Model | Sequence Length | Use Varlen Attn | Sequence Parallel World Size | Tokens per Second | 53 | | :----------: | :-------------: | :-------------: | :--------------------------: | :---------------: | 54 | | mixtral_8x7b | 32k | False | 1 | 853.7 | 55 | | mixtral_8x7b | 32k | True | 1 | 910.1 | 56 | | mixtral_8x7b | 32k | False | 2 | 635.2 | 57 | | mixtral_8x7b | 32k | True | 2 | 650.9 | 58 | -------------------------------------------------------------------------------- /xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/README.md: -------------------------------------------------------------------------------- 1 | # Qwen 110B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # We recommend installing flash_attn 10 | # pip install flash-attn 11 | 12 | # install the latest transformers 13 | pip install -U transformers 14 | ``` 15 | 16 | ## QLoRA Fine-tune 17 | 18 | Training Qwen 110B with 32k context capability requires only 2 * A100 80G.
19 | 20 | ```bash 21 | xtuner train xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py --deepspeed deepspeed_zero3 22 | ``` 23 | 24 |
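The `.pth` checkpoint saved by the command above can be converted into a HuggingFace-format adapter with xtuner's generic `pth_to_hf` converter. The work directory and iteration number below are placeholders for whatever your run actually produces:

```bash
# Convert the saved .pth checkpoint to a HuggingFace adapter (paths are placeholders)
xtuner convert pth_to_hf \
    xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py \
    ./work_dirs/qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus/iter_xxx.pth \
    ./qwen1_5_110b_chat_adapter
```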
25 | 26 |
27 | -------------------------------------------------------------------------------- /xtuner/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import warnings 3 | 4 | from .concat_dataset import ConcatDataset 5 | from .huggingface import process_hf_dataset 6 | from .intern_repo import ( 7 | build_packed_dataset, 8 | load_intern_repo_tokenized_dataset, 9 | load_intern_repo_untokenized_dataset, 10 | ) 11 | from .internvl_dataset import InternVL_V1_5_Dataset 12 | from .json_dataset import load_json_file 13 | from .llava import LLaVADataset 14 | from .modelscope import process_ms_dataset 15 | from .moss_sft import MOSSSFTDataset 16 | from .refcoco_json import ( 17 | InvRefCOCOJsonDataset, 18 | RefCOCOJsonDataset, 19 | RefCOCOJsonEvalDataset, 20 | ) 21 | from .utils import decode_base64_to_image, expand2square, load_image 22 | 23 | # ignore FutureWarning in hf datasets 24 | warnings.simplefilter(action="ignore", category=FutureWarning) 25 | 26 | __all__ = [ 27 | "process_hf_dataset", 28 | "ConcatDataset", 29 | "MOSSSFTDataset", 30 | "process_ms_dataset", 31 | "LLaVADataset", 32 | "expand2square", 33 | "decode_base64_to_image", 34 | "load_image", 35 | "load_intern_repo_tokenized_dataset", 36 | "load_intern_repo_untokenized_dataset", 37 | "build_packed_dataset", 38 | "RefCOCOJsonDataset", 39 | "RefCOCOJsonEvalDataset", 40 | "InvRefCOCOJsonDataset", 41 | "load_json_file", 42 | "InternVL_V1_5_Dataset", 43 | ] 44 | -------------------------------------------------------------------------------- /xtuner/dataset/collate_fns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .default_collate_fn import default_collate_fn 3 | from .mmlu_collate_fn import mmlu_collate_fn 4 | 5 | __all__ = ["default_collate_fn", "mmlu_collate_fn"] 6 | -------------------------------------------------------------------------------- /xtuner/dataset/collate_fns/mmlu_collate_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
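# A minimal usage sketch for the collate function below (the dataset object and batch
# size are placeholders): it pads `input_ids` / `labels` to the batch max length and,
# unless `return_hf_format=True`, wraps the batch as {"data": ..., "data_samples": ...}.
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(mmlu_dataset, batch_size=4, collate_fn=mmlu_collate_fn)
#   batch = next(iter(loader))
#   input_ids = batch["data"]["input_ids"]          # padded token ids
#   subjects = batch["data_samples"]["subjects"]    # one MMLU subject per sample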
2 | from typing import Dict, Sequence 3 | 4 | import torch 5 | from torch.nn.utils.rnn import pad_sequence 6 | 7 | from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX 8 | 9 | 10 | def mmlu_collate_fn( 11 | instances: Sequence[Dict], 12 | pad_index: int = DEFAULT_PAD_TOKEN_INDEX, 13 | return_hf_format: bool = False, 14 | ) -> Dict[str, torch.Tensor]: 15 | input_ids = [] 16 | labels = [] 17 | data_samples = {"labels": [], "subjects": []} 18 | for example in instances: 19 | input_ids.append(torch.tensor(example["input_ids"])) 20 | labels.append(torch.tensor(example["labels"])) 21 | data_samples["labels"].append(example["output"]) 22 | data_samples["subjects"].append(example["subject"]) 23 | if len(instances) > 1: 24 | input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_index) 25 | labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) 26 | else: 27 | input_ids = torch.stack(input_ids) 28 | labels = torch.stack(labels) 29 | 30 | data_dict = { 31 | "input_ids": input_ids, 32 | "attention_mask": input_ids.ne(pad_index), 33 | "labels": labels, 34 | } 35 | 36 | if return_hf_format: 37 | return data_dict 38 | else: 39 | return {"data": data_dict, "data_samples": data_samples} 40 | -------------------------------------------------------------------------------- /xtuner/dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from torch.utils.data import ConcatDataset as TorchConcatDataset 3 | 4 | from xtuner.registry import BUILDER 5 | 6 | 7 | class ConcatDataset(TorchConcatDataset): 8 | def __init__(self, datasets): 9 | datasets_instance = [] 10 | for cfg in datasets: 11 | datasets_instance.append(BUILDER.build(cfg)) 12 | super().__init__(datasets=datasets_instance) 13 | 14 | def __repr__(self): 15 | main_str = "Dataset as a concatenation of multiple datasets. \n" 16 | main_str += ",\n".join([f"{repr(dataset)}" for dataset in self.datasets]) 17 | return main_str 18 | -------------------------------------------------------------------------------- /xtuner/dataset/json_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import json 3 | import os 4 | 5 | from datasets import Dataset, concatenate_datasets 6 | 7 | 8 | def load_json_file(data_files=None, data_dir=None, suffix=None): 9 | assert (data_files is not None) != (data_dir is not None) 10 | if data_dir is not None: 11 | data_files = os.listdir(data_dir) 12 | data_files = [os.path.join(data_dir, fn) for fn in data_files] 13 | if suffix is not None: 14 | data_files = [fp for fp in data_files if fp.endswith(suffix)] 15 | elif isinstance(data_files, str): 16 | data_files = [data_files] 17 | 18 | dataset_list = [] 19 | for fp in data_files: 20 | with open(fp, encoding="utf-8") as file: 21 | data = json.load(file) 22 | ds = Dataset.from_list(data) 23 | dataset_list.append(ds) 24 | dataset = concatenate_datasets(dataset_list) 25 | return dataset 26 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .dataset_map_fns import * # noqa: F401, F403 3 | from .template_map_fn import template_map_fn # noqa: F401 4 | from .template_map_fn import template_map_fn_factory # noqa: F401 5 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .alpaca_map_fn import alpaca_map_fn 3 | from .alpaca_zh_map_fn import alpaca_zh_map_fn 4 | from .arxiv_map_fn import arxiv_map_fn 5 | from .code_alpaca_map_fn import code_alpaca_map_fn 6 | from .colors_map_fn import colors_map_fn 7 | from .crime_kg_assitant_map_fn import crime_kg_assitant_map_fn 8 | from .default_map_fn import default_map_fn 9 | from .law_reference_map_fn import law_reference_map_fn 10 | from .llava_map_fn import llava_image_only_map_fn, llava_map_fn 11 | from .medical_map_fn import medical_map_fn 12 | from .msagent_map_fn import msagent_react_map_fn 13 | from .oasst1_map_fn import oasst1_map_fn 14 | from .openai_map_fn import openai_map_fn 15 | from .openorca_map_fn import openorca_map_fn 16 | from .pretrain_map_fn import pretrain_map_fn 17 | from .sql_map_fn import sql_map_fn 18 | from .stack_exchange_map_fn import stack_exchange_map_fn 19 | from .tiny_codes_map_fn import tiny_codes_map_fn 20 | from .wizardlm_map_fn import wizardlm_map_fn 21 | 22 | DATASET_FORMAT_MAPPING = dict( 23 | alpaca=alpaca_map_fn, 24 | alpaca_zh=alpaca_zh_map_fn, 25 | arxiv=arxiv_map_fn, 26 | code_alpaca=code_alpaca_map_fn, 27 | colors=colors_map_fn, 28 | crime_kg_assitan=crime_kg_assitant_map_fn, 29 | default=default_map_fn, 30 | law_reference=law_reference_map_fn, 31 | llava_image_only=llava_image_only_map_fn, 32 | llava=llava_map_fn, 33 | medical=medical_map_fn, 34 | msagent_react=msagent_react_map_fn, 35 | oasst1=oasst1_map_fn, 36 | openai=openai_map_fn, 37 | openorca=openorca_map_fn, 38 | pretrain=pretrain_map_fn, 39 | sql=sql_map_fn, 40 | stack_exchange=stack_exchange_map_fn, 41 | tiny_codes=tiny_codes_map_fn, 42 | wizardlm=wizardlm_map_fn, 43 | ) 44 | 45 | __all__ = [ 46 | "alpaca_map_fn", 47 | "alpaca_zh_map_fn", 48 | "oasst1_map_fn", 49 | "arxiv_map_fn", 50 | "medical_map_fn", 51 | "openorca_map_fn", 52 | "code_alpaca_map_fn", 53 | "tiny_codes_map_fn", 54 | "colors_map_fn", 55 | "law_reference_map_fn", 56 | "crime_kg_assitant_map_fn", 57 | "sql_map_fn", 58 | "openai_map_fn", 59 | "wizardlm_map_fn", 60 | "stack_exchange_map_fn", 61 | "msagent_react_map_fn", 62 | "pretrain_map_fn", 63 | "default_map_fn", 64 | "llava_image_only_map_fn", 65 | "llava_map_fn", 66 | "DATASET_FORMAT_MAPPING", 67 | ] 68 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/alpaca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | 4 | def alpaca_map_fn(example): 5 | if example.get("output") == "": 6 | return {"conversation": []} 7 | else: 8 | return { 9 | "conversation": [ 10 | { 11 | "input": f"{example['instruction']}\n{example['input']}", 12 | "output": example["output"], 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/alpaca_zh_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
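# A worked example of the map function below (field values are made-up placeholders):
# it flattens one Chinese Alpaca record into xtuner's single-turn conversation format.
#
#   example = {
#       "instruction_zh": "Summarize the paragraph.",
#       "input_zh": "<paragraph text>",
#       "output_zh": "<summary text>",
#   }
#   alpaca_zh_map_fn(example)
#   # -> {"conversation": [{"input": "Summarize the paragraph.\n<paragraph text>",
#   #                       "output": "<summary text>"}]}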
2 | 3 | 4 | def alpaca_zh_map_fn(example): 5 | return { 6 | "conversation": [ 7 | { 8 | "input": f"{example['instruction_zh']}\n{example['input_zh']}", 9 | "output": example["output_zh"], 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/arxiv_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def arxiv_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.arxiv_gentile, 10 | "input": example["abstract"], 11 | "output": example["title"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/code_alpaca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def code_alpaca_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.coder, 10 | "input": example["prompt"], 11 | "output": example["completion"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/colors_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def colors_map_fn(example): 6 | desc = ":".join(example["description"].split(":")[1:]).strip() 7 | return { 8 | "conversation": [ 9 | { 10 | "system": SYSTEM_TEMPLATE.colorist, 11 | "input": desc, 12 | "output": example["color"], 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/crime_kg_assitant_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def crime_kg_assitant_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.lawyer, 10 | "input": example["input"], 11 | "output": example["output"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/default_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def default_map_fn(example): 3 | return {"conversation": [{"input": example["input"], "output": example["output"]}]} 4 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/law_reference_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def law_reference_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.lawyer, 10 | "input": example["question"], 11 | "output": example["answer"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import DEFAULT_IMAGE_TOKEN 3 | 4 | 5 | def llava_image_only_map_fn(example): 6 | # input contains the DEFAULT_IMAGE_TOKEN only 7 | messages = example["conversations"] 8 | input = "" 9 | conversation = [] 10 | while messages and messages[0]["from"] == "gpt": 11 | # Skip the first one if it is from gpt 12 | messages = messages[1:] 13 | for msg in messages: 14 | if msg["from"] == "human": 15 | assert DEFAULT_IMAGE_TOKEN in msg["value"] 16 | input += DEFAULT_IMAGE_TOKEN 17 | elif msg["from"] == "gpt": 18 | conversation.append({"input": input, "output": msg["value"]}) 19 | input = "" 20 | else: 21 | raise NotImplementedError 22 | return {"conversation": conversation} 23 | 24 | 25 | def llava_map_fn(example): 26 | messages = example["conversations"] 27 | input = "" 28 | conversation = [] 29 | while messages and messages[0]["from"] == "gpt": 30 | # Skip the first one if it is from gpt 31 | messages = messages[1:] 32 | for msg in messages: 33 | if msg["from"] == "human": 34 | if DEFAULT_IMAGE_TOKEN in msg["value"]: 35 | msg["value"] = msg["value"].replace(DEFAULT_IMAGE_TOKEN, "").strip() 36 | msg["value"] = DEFAULT_IMAGE_TOKEN + "\n" + msg["value"] 37 | msg["value"] = msg["value"].strip() 38 | input += msg["value"] 39 | 40 | elif msg["from"] == "gpt": 41 | conversation.append({"input": input, "output": msg["value"]}) 42 | input = "" 43 | else: 44 | raise NotImplementedError 45 | return {"conversation": conversation} 46 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/medical_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def medical_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.medical, 10 | "input": "{instruction}\n{input}".format(**example), 11 | "output": example["output"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/oasst1_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def oasst1_map_fn(example): 3 | r"""Example before preprocessing: 4 | example['text'] = '### Human: Can you explain xxx' 5 | '### Assistant: Sure! xxx' 6 | '### Human: I didn't understand how xxx' 7 | '### Assistant: It has to do with a process xxx.' 8 | 9 | Example after preprocessing: 10 | example['conversation'] = [ 11 | { 12 | 'input': 'Can you explain xxx', 13 | 'output': 'Sure! xxx' 14 | }, 15 | { 16 | 'input': 'I didn't understand how xxx', 17 | 'output': 'It has to do with a process xxx.' 
18 | } 19 | ] 20 | """ 21 | data = [] 22 | for sentence in example["text"].strip().split("###"): 23 | sentence = sentence.strip() 24 | if sentence[:6] == "Human:": 25 | data.append(sentence[6:].strip()) 26 | elif sentence[:10] == "Assistant:": 27 | data.append(sentence[10:].strip()) 28 | if len(data) % 2: 29 | # The last round of conversation solely consists of input 30 | # without any output. 31 | # Discard the input part of the last round, as this part is ignored in 32 | # the loss calculation. 33 | data.pop() 34 | conversation = [] 35 | for i in range(0, len(data), 2): 36 | single_turn_conversation = {"input": data[i], "output": data[i + 1]} 37 | conversation.append(single_turn_conversation) 38 | return {"conversation": conversation} 39 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/openai_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def openai_map_fn(example): 3 | """ 4 | Example before preprocessing: 5 | example["messages"] = [ 6 | { "role": "system", "content": "You are an assistant that 7 | occasionally misspells words." }, 8 | { "role": "user", "content": "Tell me a story." }, 9 | { "role": "assistant", "content": "One day a student 10 | went to schoool." } 11 | ] 12 | Example after preprocessing: 13 | example["conversation"] = [ 14 | { 15 | "system": "You are an assistant that occasionally misspells 16 | words.", 17 | "input": "Tell me a story.", 18 | "output": "One day a student went to schoool." 19 | } 20 | ] 21 | """ 22 | messages = example["messages"] 23 | system = "" 24 | input = "" 25 | conversation = [] 26 | while messages and messages[0]["role"] == "assistant": 27 | # Skip the first one if it is from assistant 28 | messages = messages[1:] 29 | for msg in messages: 30 | if msg["role"] == "system": 31 | system = msg["content"] 32 | elif msg["role"] == "user": 33 | input += msg["content"] 34 | elif msg["role"] == "assistant": 35 | output_with_loss = msg.get("loss", "True") 36 | output_with_loss = str(output_with_loss) 37 | output_with_loss = output_with_loss.lower() == "true" 38 | conversation.append( 39 | { 40 | "system": system, 41 | "input": input, 42 | "output": msg["content"], 43 | "output_with_loss": output_with_loss, 44 | } 45 | ) 46 | system = "" 47 | input = "" 48 | else: 49 | raise NotImplementedError 50 | return {"conversation": conversation} 51 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/openorca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def openorca_map_fn(example): 3 | return { 4 | "conversation": [ 5 | { 6 | "system": example["system_prompt"], 7 | "input": example["question"], 8 | "output": example["response"], 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/pretrain_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | def pretrain_map_fn(example): 3 | r"""Example before preprocessing: 4 | example['text'] = 'xxx' 5 | 6 | Example after preprocessing: 7 | example['conversation'] = [ 8 | { 9 | 'input': '', 10 | 'output': 'xxx' 11 | }, 12 | ] 13 | """ 14 | return { 15 | "conversation": [ 16 | {"input": "", "output": example["text"].strip(), "need_eos_token": False} 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/sql_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def sql_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.sql, 10 | "input": "{context}\n{question}".format(**example), 11 | "output": example["answer"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/stack_exchange_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def stack_exchange_map_fn(example): 3 | return { 4 | "conversation": [{"input": example["question"], "output": example["response"]}] 5 | } 6 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/tiny_codes_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def tiny_codes_map_fn(example): 6 | return { 7 | "conversation": [ 8 | { 9 | "system": SYSTEM_TEMPLATE.coder, 10 | "input": example["prompt"], 11 | "output": example["response"], 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/dataset_map_fns/wizardlm_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def wizardlm_map_fn(example): 3 | messages = example["conversations"] 4 | input = "" 5 | conversation = [] 6 | while messages and messages[0]["from"] == "gpt": 7 | # Skip the first one if it is from gpt 8 | messages = messages[1:] 9 | for msg in messages: 10 | if msg["from"] == "human": 11 | input += msg["value"] 12 | elif msg["from"] == "gpt": 13 | conversation.append({"input": input, "output": msg["value"]}) 14 | input = "" 15 | else: 16 | raise NotImplementedError 17 | return {"conversation": conversation} 18 | -------------------------------------------------------------------------------- /xtuner/dataset/map_fns/template_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
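# A sketch of what the map function below produces, assuming a template whose SYSTEM is
# "<|System|>:{system}\n" and whose INSTRUCTION is "<|User|>:{input}\n<|Bot|>:" (both
# strings are illustrative, not an actual xtuner template):
#
#   turn = {"system": "Be brief.", "input": "Hi", "output": "Hello"}
#   # becomes
#   {"input": "<|System|>:Be brief.\n<|User|>:Hi\n<|Bot|>:",
#    "output": "Hello",            # SUFFIX, if defined, would be appended here
#    "need_eos_token": True,       # because SUFFIX_AS_EOS is not set in this template
#    "sep": ""}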
2 | from functools import partial 3 | 4 | from mmengine.utils.misc import get_object_from_string 5 | 6 | 7 | def template_map_fn(example, template): 8 | conversation = example.get("conversation", []) 9 | for i, single_turn_conversation in enumerate(conversation): 10 | input = single_turn_conversation.get("input", "") 11 | if input is None: 12 | input = "" 13 | input_text = template.INSTRUCTION.format(input=input, round=i + 1) 14 | system = single_turn_conversation.get("system", "") 15 | if system != "" and system is not None: 16 | system = template.SYSTEM.format(system=system) 17 | input_text = system + input_text 18 | single_turn_conversation["input"] = input_text 19 | 20 | if template.get("SUFFIX", None): 21 | output_text = single_turn_conversation.get("output", "") 22 | output_text += template.SUFFIX 23 | single_turn_conversation["output"] = output_text 24 | 25 | # SUFFIX_AS_EOS is False ==> need_eos_token is True 26 | single_turn_conversation["need_eos_token"] = not template.get( 27 | "SUFFIX_AS_EOS", False 28 | ) 29 | single_turn_conversation["sep"] = template.get("SEP", "") 30 | 31 | return {"conversation": conversation} 32 | 33 | 34 | def template_map_fn_factory(template): 35 | if isinstance(template, str): # for resume 36 | template = get_object_from_string(template) 37 | return partial(template_map_fn, template=template) 38 | -------------------------------------------------------------------------------- /xtuner/dataset/modelscope.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.config import Config, ConfigDict 3 | 4 | from xtuner.registry import BUILDER 5 | 6 | from .huggingface import process_hf_dataset 7 | 8 | 9 | def process_ms_dataset(dataset, split="train", *args, **kwargs): 10 | """Post-process the dataset loaded from the ModelScope Hub.""" 11 | 12 | if isinstance(dataset, (Config, ConfigDict)): 13 | dataset = BUILDER.build(dataset) 14 | if isinstance(dataset, dict): 15 | dataset = dataset[split] 16 | dataset = dataset.to_hf_dataset() 17 | return process_hf_dataset(dataset, *args, **kwargs) 18 | -------------------------------------------------------------------------------- /xtuner/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .intern_repo import InternlmRepoSampler, InternRepoSampler 3 | from .length_grouped import LengthGroupedSampler 4 | 5 | __all__ = ["LengthGroupedSampler", "InternRepoSampler", "InternlmRepoSampler"] 6 | -------------------------------------------------------------------------------- /xtuner/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ._strategy import DeepSpeedStrategy 3 | from .hooks import ( 4 | DatasetInfoHook, 5 | EvaluateChatHook, 6 | ThroughputHook, 7 | VarlenAttnArgsToMessageHubHook, 8 | ) 9 | from .runner import TrainLoop 10 | 11 | __all__ = [ 12 | "EvaluateChatHook", 13 | "DatasetInfoHook", 14 | "ThroughputHook", 15 | "VarlenAttnArgsToMessageHubHook", 16 | "DeepSpeedStrategy", 17 | "TrainLoop", 18 | ] 19 | -------------------------------------------------------------------------------- /xtuner/engine/_strategy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .deepspeed import DeepSpeedStrategy 3 | 4 | __all__ = ["DeepSpeedStrategy"] 5 | -------------------------------------------------------------------------------- /xtuner/engine/_strategy/deepspeed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Optional 3 | 4 | from mmengine._strategy import DeepSpeedStrategy as MMEngineDeepSpeedStrategy 5 | 6 | from xtuner import DS_CEPH_DIR 7 | from xtuner.parallel.sequence import init_sequence_parallel 8 | from xtuner.utils.device import get_device 9 | from xtuner.utils.fileio import patch_fileio 10 | 11 | 12 | class DeepSpeedStrategy(MMEngineDeepSpeedStrategy): 13 | def __init__(self, *args, **kwargs): 14 | sequence_parallel_size = kwargs.pop("sequence_parallel_size", 1) 15 | self.sequence_parallel_size = sequence_parallel_size 16 | 17 | super().__init__(*args, **kwargs) 18 | 19 | from transformers.integrations.deepspeed import HfDeepSpeedConfig 20 | 21 | # hf_deepspeed_config has to be saved as an attribute. 22 | self.hf_deepspeed_config = HfDeepSpeedConfig(self.config) 23 | 24 | def _wrap_model(self, model): 25 | wrapper = super()._wrap_model(model) 26 | # hard code for deepspeed zero3 27 | # When utilizing Zero3, the model isn't allocated to CUDA within the 28 | # `deepspeed.initialize` process. 29 | assert hasattr(wrapper.model, "data_preprocessor") 30 | wrapper.model.data_preprocessor.to(get_device()) 31 | return wrapper 32 | 33 | def save_checkpoint(self, *args, **kwargs) -> None: 34 | if DS_CEPH_DIR: 35 | from os import path as osp 36 | 37 | work_dir_prefix = osp.split(self.work_dir)[0] 38 | 39 | filename = kwargs["filename"].replace(work_dir_prefix, DS_CEPH_DIR) 40 | kwargs["filename"] = filename 41 | with patch_fileio(): 42 | super().save_checkpoint(*args, **kwargs) 43 | else: 44 | super().save_checkpoint(*args, **kwargs) 45 | 46 | def load_checkpoint(self, *args, **kwargs) -> None: 47 | if DS_CEPH_DIR: 48 | with patch_fileio(): 49 | checkpoint = super().load_checkpoint(*args, **kwargs) 50 | else: 51 | checkpoint = super().load_checkpoint(*args, **kwargs) 52 | return checkpoint 53 | 54 | def resume(self, *args, **kwargs) -> None: 55 | if DS_CEPH_DIR: 56 | with patch_fileio(): 57 | checkpoint = super().resume(*args, **kwargs) 58 | else: 59 | checkpoint = super().resume(*args, **kwargs) 60 | return checkpoint 61 | 62 | def _setup_distributed( # type: ignore 63 | self, 64 | launcher: Optional[str] = None, 65 | backend: str = "nccl", 66 | **kwargs, 67 | ): 68 | super()._setup_distributed(launcher, backend, **kwargs) 69 | init_sequence_parallel(self.sequence_parallel_size) 70 | -------------------------------------------------------------------------------- /xtuner/engine/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
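# The hooks re-exported below are typically registered via `custom_hooks` in an xtuner
# config; a rough sketch (the EvaluateChatHook arguments shown are assumptions, check the
# hook implementations for the exact signatures):
#
#   custom_hooks = [
#       dict(type=DatasetInfoHook, tokenizer=tokenizer),
#       dict(type=EvaluateChatHook, tokenizer=tokenizer, every_n_iters=500,
#            evaluation_inputs=["Please tell me five scenic spots in Shanghai"]),
#   ]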
2 | from .dataset_info_hook import DatasetInfoHook 3 | from .evaluate_chat_hook import EvaluateChatHook 4 | from .hf_checkpoint_hook import HFCheckpointHook 5 | from .throughput_hook import ThroughputHook 6 | from .varlen_attn_args_to_messagehub_hook import VarlenAttnArgsToMessageHubHook 7 | 8 | __all__ = [ 9 | "EvaluateChatHook", 10 | "DatasetInfoHook", 11 | "ThroughputHook", 12 | "VarlenAttnArgsToMessageHubHook", 13 | "HFCheckpointHook", 14 | ] 15 | -------------------------------------------------------------------------------- /xtuner/engine/hooks/dataset_info_hook.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.hooks import Hook 3 | 4 | from xtuner.registry import BUILDER 5 | from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX 6 | 7 | 8 | def split_list(lst, value): 9 | res = [] 10 | tmp_res = [] 11 | for i in lst: 12 | if i == value: 13 | res.append(tmp_res) 14 | tmp_res = [] 15 | else: 16 | tmp_res.append(i) 17 | res.append(tmp_res) 18 | return res 19 | 20 | 21 | class DatasetInfoHook(Hook): 22 | def __init__(self, tokenizer, is_intern_repo_dataset=False): 23 | self.tokenizer = BUILDER.build(tokenizer) 24 | self.is_intern_repo_dataset = is_intern_repo_dataset 25 | 26 | def log(self, runner, dataset, mode="train"): 27 | def _log(input_ids, log_prefix=""): 28 | if self.is_intern_repo_dataset: 29 | input_ids = [abs(x) for x in input_ids] 30 | # Try to split list to be compatible with IMAGE token 31 | input_ids = split_list(input_ids, IMAGE_TOKEN_INDEX) 32 | text = log_prefix 33 | for idx, ids in enumerate(input_ids): 34 | text += self.tokenizer.decode(ids) 35 | if idx != len(input_ids) - 1: 36 | text += DEFAULT_IMAGE_TOKEN 37 | runner.logger.info(text) 38 | 39 | runner.logger.info(f"Num {mode} samples {len(dataset)}") 40 | runner.logger.info(f"{mode} example:") 41 | if "chosen_ids" in dataset[0]: 42 | _log(dataset[0]["chosen_ids"], log_prefix="chosen: ") 43 | _log(dataset[0]["rejected_ids"], log_prefix="rejected: ") 44 | else: 45 | _log(dataset[0]["input_ids"]) 46 | 47 | def before_train(self, runner) -> None: 48 | do_train = runner.train_loop is not None 49 | do_eval = runner.val_loop is not None 50 | if do_train: 51 | train_dataset = runner.train_dataloader.dataset 52 | self.log(runner, train_dataset, mode="train") 53 | if do_eval: 54 | eval_dataset = runner.val_dataloader.dataset 55 | self.log(runner, eval_dataset, mode="eval") 56 | 57 | def before_val(self, runner) -> None: 58 | eval_dataset = runner.val_dataloader.dataset 59 | self.log(runner, eval_dataset, mode="eval") 60 | 61 | def before_test(self, runner) -> None: 62 | test_dataset = runner.test_dataloader.dataset 63 | self.log(runner, test_dataset, mode="test") 64 | -------------------------------------------------------------------------------- /xtuner/engine/runner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .loops import TrainLoop 3 | 4 | __all__ = ["TrainLoop"] 5 | -------------------------------------------------------------------------------- /xtuner/engine/runner/loops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
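# A sketch of how the loop below is usually selected in an xtuner config (values are
# placeholders); exactly one of `max_iters` / `max_epochs` may be given, and `max_epochs`
# is converted to iterations via `max_epochs * len(dataloader)`:
#
#   train_cfg = dict(type=TrainLoop, max_epochs=3)
#   # or: train_cfg = dict(type=TrainLoop, max_iters=10000)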
2 | from typing import Dict, Optional, Union 3 | 4 | from mmengine.runner import IterBasedTrainLoop 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | class TrainLoop(IterBasedTrainLoop): 9 | def __init__( 10 | self, 11 | runner, 12 | dataloader: Union[DataLoader, Dict], 13 | max_iters: Optional[int] = None, 14 | max_epochs: Union[int, float] = None, 15 | **kwargs, 16 | ) -> None: 17 | if max_iters is None and max_epochs is None: 18 | raise RuntimeError( 19 | "Please specify the `max_iters` or " "`max_epochs` in `train_cfg`." 20 | ) 21 | elif max_iters is not None and max_epochs is not None: 22 | raise RuntimeError( 23 | "Only one of `max_iters` or `max_epochs` can " "exist in `train_cfg`." 24 | ) 25 | else: 26 | if max_iters is not None: 27 | iters = int(max_iters) 28 | assert iters == max_iters, ( 29 | "`max_iters` should be a integer " f"number, but get {max_iters}" 30 | ) 31 | elif max_epochs is not None: 32 | if isinstance(dataloader, dict): 33 | diff_rank_seed = runner._randomness_cfg.get("diff_rank_seed", False) 34 | dataloader = runner.build_dataloader( 35 | dataloader, seed=runner.seed, diff_rank_seed=diff_rank_seed 36 | ) 37 | iters = max_epochs * len(dataloader) 38 | else: 39 | raise NotImplementedError 40 | super().__init__( 41 | runner=runner, dataloader=dataloader, max_iters=iters, **kwargs 42 | ) 43 | -------------------------------------------------------------------------------- /xtuner/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .metrics import MMLUMetric 3 | 4 | __all__ = ["MMLUMetric"] 5 | -------------------------------------------------------------------------------- /xtuner/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .mmlu_metric import MMLUMetric 3 | 4 | __all__ = ["MMLUMetric"] 5 | -------------------------------------------------------------------------------- /xtuner/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .internvl import InternVL_V1_5 3 | from .llava import LLaVAModel 4 | from .sft import SupervisedFinetune 5 | 6 | __all__ = ["SupervisedFinetune", "LLaVAModel", "InternVL_V1_5"] 7 | -------------------------------------------------------------------------------- /xtuner/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatch import dispatch_modules 3 | from .projector import ProjectorConfig, ProjectorModel 4 | 5 | __all__ = ["dispatch_modules", "ProjectorConfig", "ProjectorModel"] 6 | -------------------------------------------------------------------------------- /xtuner/model/modules/dispatch/triton_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .layer_norm import layer_norm_forward 3 | from .rms_norm import rms_norm_forward 4 | from .rotary import apply_rotary_emb 5 | 6 | __all__ = ["rms_norm_forward", "layer_norm_forward", "apply_rotary_emb"] 7 | -------------------------------------------------------------------------------- /xtuner/model/modules/dispatch/triton_kernels/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def layer_norm_forward(self, hidden_states): 7 | input_dtype = hidden_states.dtype 8 | hidden_states = hidden_states.to(torch.float32) 9 | hidden_states = F.layer_norm( 10 | hidden_states, (hidden_states.shape[-1],), eps=self.variance_epsilon 11 | ) 12 | hidden_states = self.weight.to(torch.float32) * hidden_states 13 | return hidden_states.to(input_dtype) 14 | -------------------------------------------------------------------------------- /xtuner/model/modules/dispatch/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | try: 6 | from flash_attn.bert_padding import index_first_axis, unpad_input 7 | except ImportError: 8 | pass 9 | 10 | 11 | def _get_unpad_data(attention_mask): 12 | seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) 13 | indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() 14 | max_seqlen_in_batch = seqlens_in_batch.max().item() 15 | cu_seqlens = F.pad( 16 | torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) 17 | ) 18 | return ( 19 | indices, 20 | cu_seqlens, 21 | max_seqlen_in_batch, 22 | ) 23 | 24 | 25 | def upad_qkv(query_layer, key_layer, value_layer, attention_mask, query_length): 26 | indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) 27 | batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape 28 | 29 | key_layer = index_first_axis( 30 | key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), 31 | indices_k, 32 | ) 33 | value_layer = index_first_axis( 34 | value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), 35 | indices_k, 36 | ) 37 | if query_length == kv_seq_len: 38 | # Different from the origin version as sequence parallel change 39 | # the number of attention heads. 40 | query_layer = index_first_axis( 41 | query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k 42 | ) 43 | cu_seqlens_q = cu_seqlens_k 44 | max_seqlen_in_batch_q = max_seqlen_in_batch_k 45 | indices_q = indices_k 46 | elif query_length == 1: 47 | max_seqlen_in_batch_q = 1 48 | cu_seqlens_q = torch.arange( 49 | batch_size + 1, dtype=torch.int32, device=query_layer.device 50 | ) # There is a memcpy here, that is very bad. 51 | indices_q = cu_seqlens_q[:-1] 52 | query_layer = query_layer.squeeze(1) 53 | else: 54 | # The -q_len: slice assumes left padding. 
55 | attention_mask = attention_mask[:, -query_length:] 56 | query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( 57 | query_layer, attention_mask 58 | ) 59 | 60 | return ( 61 | query_layer, 62 | key_layer, 63 | value_layer, 64 | indices_q, 65 | (cu_seqlens_q, cu_seqlens_k), 66 | (max_seqlen_in_batch_q, max_seqlen_in_batch_k), 67 | ) 68 | -------------------------------------------------------------------------------- /xtuner/model/modules/projector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import AutoConfig, AutoModel 3 | 4 | from .configuration_projector import ProjectorConfig 5 | from .modeling_projector import ProjectorModel 6 | 7 | AutoConfig.register("projector", ProjectorConfig) 8 | AutoModel.register(ProjectorConfig, ProjectorModel) 9 | 10 | __all__ = ["ProjectorConfig", "ProjectorModel"] 11 | -------------------------------------------------------------------------------- /xtuner/model/modules/projector/configuration_projector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import PretrainedConfig 3 | 4 | 5 | class ProjectorConfig(PretrainedConfig): 6 | model_type = "projector" 7 | _auto_class = "AutoConfig" 8 | 9 | def __init__( 10 | self, 11 | visual_hidden_size=4096, 12 | llm_hidden_size=4096, 13 | depth=2, 14 | hidden_act="gelu", 15 | bias=True, 16 | **kwargs, 17 | ): 18 | self.visual_hidden_size = visual_hidden_size 19 | self.llm_hidden_size = llm_hidden_size 20 | self.depth = depth 21 | self.hidden_act = hidden_act 22 | self.bias = bias 23 | super().__init__(**kwargs) 24 | -------------------------------------------------------------------------------- /xtuner/model/modules/projector/modeling_projector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
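# A minimal usage sketch for the projector below (sizes and batch shape are placeholders):
# it maps visual features of width `visual_hidden_size` to the LLM hidden size with a
# small MLP.
#
#   import torch
#   config = ProjectorConfig(visual_hidden_size=1024, llm_hidden_size=4096, depth=2)
#   projector = ProjectorModel(config)
#   feats = torch.randn(1, 576, 1024)   # e.g. ViT patch features
#   out = projector(feats)              # -> shape (1, 576, 4096)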
2 | import torch 3 | import torch.nn as nn 4 | from transformers import PreTrainedModel 5 | from transformers.activations import ACT2FN 6 | 7 | from .configuration_projector import ProjectorConfig 8 | 9 | 10 | class ProjectorModel(PreTrainedModel): 11 | _auto_class = "AutoModel" 12 | config_class = ProjectorConfig 13 | base_model_prefix = "model" 14 | supports_gradient_checkpointing = True 15 | 16 | def __init__(self, config: ProjectorConfig) -> None: 17 | super().__init__(config) 18 | self.gradient_checkpointing = False 19 | 20 | modules = [ 21 | nn.Linear( 22 | config.visual_hidden_size, config.llm_hidden_size, bias=config.bias 23 | ) 24 | ] 25 | for _ in range(1, config.depth): 26 | modules.append(ACT2FN[config.hidden_act]) 27 | modules.append( 28 | nn.Linear( 29 | config.llm_hidden_size, config.llm_hidden_size, bias=config.bias 30 | ) 31 | ) 32 | self.model = nn.Sequential(*modules) 33 | 34 | def enable_input_require_grads(self): 35 | def make_inputs_require_grad(module, input, output): 36 | output.requires_grad_(True) 37 | 38 | self.model.register_forward_hook(make_inputs_require_grad) 39 | 40 | def _set_gradient_checkpointing(self, module, value=False): 41 | if isinstance(module, ProjectorModel): 42 | module.gradient_checkpointing = value 43 | 44 | def forward(self, x): 45 | if self.gradient_checkpointing and self.training: 46 | layer_outputs = torch.utils.checkpoint.checkpoint(self.model, x) 47 | else: 48 | layer_outputs = self.model(x) 49 | return layer_outputs 50 | -------------------------------------------------------------------------------- /xtuner/model/transformers_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepseek_v2 import DeepseekTokenizerFast, DeepseekV2Config, DeepseekV2ForCausalLM, DeepseekV2Model 2 | from .mixtral import MixtralConfig, MixtralForCausalLM, MixtralModel 3 | 4 | __all__ = [ 5 | 'DeepseekTokenizerFast', 'DeepseekV2Config', 'DeepseekV2ForCausalLM', 'DeepseekV2Model', 'MixtralConfig', 6 | 'MixtralForCausalLM', 'MixtralModel' 7 | ] 8 | -------------------------------------------------------------------------------- /xtuner/model/transformers_models/deepseek_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_deepseek import DeepseekV2Config 2 | from .modeling_deepseek import DeepseekV2ForCausalLM, DeepseekV2Model 3 | from .tokenization_deepseek_fast import DeepseekTokenizerFast 4 | 5 | __all__ = ['DeepseekV2ForCausalLM', 'DeepseekV2Model', 'DeepseekV2Config', 'DeepseekTokenizerFast'] 6 | -------------------------------------------------------------------------------- /xtuner/model/transformers_models/deepseek_v2/tokenization_deepseek_fast.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from transformers.models.llama import LlamaTokenizerFast 4 | 5 | 6 | class DeepseekTokenizerFast(LlamaTokenizerFast): 7 | 8 | def convert_ids_to_tokens(self, 9 | ids: Union[int, List[int]], 10 | skip_special_tokens: bool = False) -> Union[str, List[str]]: 11 | """Converts a single index or a sequence of indices in a token or a 12 | sequence of tokens, using the vocabulary and added tokens. 13 | 14 | Args: 15 | ids (`int` or `List[int]`): 16 | The token id (or token ids) to convert to tokens. 17 | skip_special_tokens (`bool`, *optional*, defaults to `False`): 18 | Whether or not to remove special tokens in the decoding. 
19 | 20 | Returns: 21 | `str` or `List[str]`: The decoded token(s). 22 | """ 23 | if isinstance(ids, int): 24 | return self._convert_id_to_token(ids) 25 | tokens = [] 26 | for index in ids: 27 | index = int(index) 28 | if skip_special_tokens and index in self.all_special_ids: 29 | continue 30 | token = self._tokenizer.id_to_token(index) 31 | tokens.append(token if token is not None else '') 32 | return tokens 33 | 34 | def _convert_id_to_token(self, index: int) -> Optional[str]: 35 | token = self._tokenizer.id_to_token(int(index)) 36 | return token if token is not None else '' 37 | -------------------------------------------------------------------------------- /xtuner/model/transformers_models/mixtral/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_mixtral import MixtralConfig 2 | from .modeling_mixtral import MixtralForCausalLM, MixtralModel 3 | 4 | __all__ = ['MixtralForCausalLM', 'MixtralModel', 'MixtralConfig'] 5 | -------------------------------------------------------------------------------- /xtuner/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .sequence import * # noqa: F401, F403 3 | -------------------------------------------------------------------------------- /xtuner/parallel/sequence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.dist import init_dist 3 | 4 | from .attention import ( 5 | post_process_for_sequence_parallel_attn, 6 | pre_process_for_sequence_parallel_attn, 7 | sequence_parallel_wrapper, 8 | ) 9 | from .comm import ( 10 | all_to_all, 11 | gather_for_sequence_parallel, 12 | gather_forward_split_backward, 13 | split_for_sequence_parallel, 14 | split_forward_gather_backward, 15 | ) 16 | from .data_collate import ( 17 | pad_cumulative_len_for_sequence_parallel, 18 | pad_for_sequence_parallel, 19 | ) 20 | from .reduce_loss import reduce_sequence_parallel_loss 21 | from .sampler import SequenceParallelSampler 22 | from .setup_distributed import ( 23 | get_data_parallel_group, 24 | get_data_parallel_rank, 25 | get_data_parallel_world_size, 26 | get_inner_sequence_parallel_group, 27 | get_inner_sequence_parallel_rank, 28 | get_inner_sequence_parallel_world_size, 29 | get_sequence_parallel_group, 30 | get_sequence_parallel_rank, 31 | get_sequence_parallel_world_size, 32 | init_inner_sequence_parallel, 33 | init_sequence_parallel, 34 | is_inner_sequence_parallel_initialized, 35 | ) 36 | 37 | __all__ = [ 38 | "sequence_parallel_wrapper", 39 | "pre_process_for_sequence_parallel_attn", 40 | "post_process_for_sequence_parallel_attn", 41 | "pad_for_sequence_parallel", 42 | "split_for_sequence_parallel", 43 | "SequenceParallelSampler", 44 | "init_sequence_parallel", 45 | "get_sequence_parallel_group", 46 | "get_sequence_parallel_world_size", 47 | "get_sequence_parallel_rank", 48 | "get_data_parallel_group", 49 | "get_data_parallel_world_size", 50 | "get_data_parallel_rank", 51 | "reduce_sequence_parallel_loss", 52 | "init_dist", 53 | "all_to_all", 54 | "gather_for_sequence_parallel", 55 | "split_forward_gather_backward", 56 | "gather_forward_split_backward", 57 | "get_inner_sequence_parallel_group", 58 | "get_inner_sequence_parallel_rank", 59 | "get_inner_sequence_parallel_world_size", 60 | "init_inner_sequence_parallel", 61 | "is_inner_sequence_parallel_initialized", 62 | 
"pad_cumulative_len_for_sequence_parallel", 63 | ] 64 | -------------------------------------------------------------------------------- /xtuner/parallel/sequence/data_collate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from .setup_distributed import get_sequence_parallel_world_size 5 | 6 | 7 | def pad_for_sequence_parallel(tensor, padding_value, dim=-1): 8 | length = tensor.shape[dim] 9 | seq_parallel_world_size = get_sequence_parallel_world_size() 10 | if length % seq_parallel_world_size == 0: 11 | return tensor 12 | 13 | pad_num = seq_parallel_world_size - (length % seq_parallel_world_size) 14 | pad_shape = ( 15 | (*tensor.shape[:dim], pad_num, *tensor.shape[dim + 1 :]) 16 | if dim != -1 17 | else (*tensor.shape[:dim], pad_num) 18 | ) 19 | pad = torch.full(pad_shape, padding_value, dtype=tensor.dtype, device=tensor.device) 20 | tensor = torch.cat([tensor, pad], dim=dim) 21 | return tensor 22 | 23 | 24 | # This function only meets the following two conditions: 25 | # 1. use_varlen_attn = True 26 | # 2. pack_to_max_length = True and the lengths of each sequence are different 27 | def pad_cumulative_len_for_sequence_parallel(cumulative_len): 28 | assert len(cumulative_len) == 1 29 | seqlen = cumulative_len[0][-1] 30 | seq_parallel_world_size = get_sequence_parallel_world_size() 31 | if seqlen % seq_parallel_world_size == 0: 32 | return cumulative_len, None 33 | 34 | bs = len(cumulative_len) 35 | pad_len = seq_parallel_world_size - (seqlen % seq_parallel_world_size) 36 | seqlen_new = seqlen + pad_len 37 | attention_mask = torch.zeros( 38 | bs, seqlen_new, dtype=torch.bool, device=cumulative_len[0].device 39 | ) 40 | attention_mask[:, :seqlen] = True 41 | 42 | for i, cu_len in enumerate(cumulative_len): 43 | pad = torch.tensor([seqlen_new], device=cu_len.device, dtype=cu_len.dtype) 44 | cumulative_len[i] = torch.cat([cu_len, pad], dim=0) 45 | 46 | return cumulative_len, attention_mask 47 | -------------------------------------------------------------------------------- /xtuner/parallel/sequence/reduce_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | import torch.distributed as dist 4 | 5 | from .setup_distributed import get_sequence_parallel_group 6 | 7 | 8 | class _ReduceLoss(torch.autograd.Function): 9 | @staticmethod 10 | def forward(ctx, mean_loss, loss_scale, process_group): 11 | ctx.mode = process_group 12 | if loss_scale == 0: 13 | # convert nan to 0 just for logging 14 | mean_loss = torch.nan_to_num(mean_loss) 15 | loss_sum = mean_loss * loss_scale 16 | dist.all_reduce(loss_sum, group=process_group) 17 | dist.all_reduce(loss_scale, group=process_group) 18 | loss = loss_sum / loss_scale 19 | return loss 20 | 21 | @staticmethod 22 | def backward(ctx, grad_output): 23 | return grad_output, None, None 24 | 25 | 26 | def reduce_sequence_parallel_loss( 27 | mean_loss, loss_scale, sp_group: dist.ProcessGroup = None 28 | ): 29 | if dist.get_world_size(sp_group) == 1: 30 | return mean_loss 31 | if sp_group is None: 32 | # avoid bc breaking 33 | sp_group = get_sequence_parallel_group() 34 | return _ReduceLoss.apply(mean_loss, loss_scale, sp_group) 35 | -------------------------------------------------------------------------------- /xtuner/parallel/sequence/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import math 3 | from typing import Optional, Sized 4 | 5 | from mmengine.dataset import DefaultSampler 6 | from mmengine.dist import sync_random_seed 7 | 8 | from .setup_distributed import get_data_parallel_rank, get_data_parallel_world_size 9 | 10 | 11 | class SequenceParallelSampler(DefaultSampler): 12 | def __init__( 13 | self, 14 | dataset: Sized, 15 | shuffle: bool = True, 16 | seed: Optional[int] = None, 17 | round_up: bool = True, 18 | ) -> None: 19 | rank = get_data_parallel_rank() 20 | world_size = get_data_parallel_world_size() 21 | self.rank = rank 22 | self.world_size = world_size 23 | 24 | self.dataset = dataset 25 | self.shuffle = shuffle 26 | if seed is None: 27 | seed = sync_random_seed() 28 | self.seed = seed 29 | self.epoch = 0 30 | self.round_up = round_up 31 | 32 | if self.round_up: 33 | self.num_samples = math.ceil(len(self.dataset) / world_size) 34 | self.total_size = self.num_samples * self.world_size 35 | else: 36 | self.num_samples = math.ceil((len(self.dataset) - rank) / world_size) 37 | self.total_size = len(self.dataset) 38 | -------------------------------------------------------------------------------- /xtuner/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.registry import Registry 3 | 4 | __all__ = ["BUILDER", "MAP_FUNC"] 5 | 6 | BUILDER = Registry("builder") 7 | MAP_FUNC = Registry("map_fn") 8 | -------------------------------------------------------------------------------- /xtuner/tools/copy_cfg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
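# A usage sketch for the tool below (the config name is a placeholder); the same logic is
# also exposed through the `xtuner copy-cfg` CLI entry point:
#
#   python xtuner/tools/copy_cfg.py internlm2_chat_7b_qlora_oasst1_e3 ./my_configs
#   # -> ./my_configs/internlm2_chat_7b_qlora_oasst1_e3_copy.py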
2 | import argparse 3 | import os.path as osp 4 | import shutil 5 | 6 | from mmengine.utils import mkdir_or_exist 7 | 8 | from xtuner.configs import cfgs_name_path 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("config_name", help="config name") 14 | parser.add_argument("save_dir", help="save directory for copied config") 15 | args = parser.parse_args() 16 | return args 17 | 18 | 19 | def add_copy_suffix(string): 20 | file_name, ext = osp.splitext(string) 21 | return f"{file_name}_copy{ext}" 22 | 23 | 24 | def main(): 25 | args = parse_args() 26 | mkdir_or_exist(args.save_dir) 27 | config_path = cfgs_name_path[args.config_name] 28 | save_path = osp.join(args.save_dir, add_copy_suffix(osp.basename(config_path))) 29 | shutil.copyfile(config_path, save_path) 30 | print(f"Copy to {save_path}") 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /xtuner/tools/data_preprocess/arxiv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import json 4 | from datetime import datetime 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("src_file", help="source file path") 10 | parser.add_argument("dst_file", help="destination file path") 11 | parser.add_argument( 12 | "--categories", 13 | nargs="+", 14 | default=["cs.AI", "cs.CL", "cs.CV"], 15 | help="target categories", 16 | ) 17 | parser.add_argument( 18 | "--start-date", default="2020-01-01", help="start date (format: YYYY-MM-DD)" 19 | ) 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def has_intersection(list1, list2): 26 | set1 = set(list1) 27 | set2 = set(list2) 28 | return len(set1.intersection(set2)) > 0 29 | 30 | 31 | def read_json_file(file_path): 32 | data = [] 33 | with open(file_path) as file: 34 | for line in file: 35 | try: 36 | json_data = json.loads(line) 37 | data.append(json_data) 38 | except json.JSONDecodeError: 39 | print(f"Failed to parse line: {line}") 40 | return data 41 | 42 | 43 | def main(): 44 | args = parse_args() 45 | json_data = read_json_file(args.src_file) 46 | from_time = datetime.strptime(args.start_date, "%Y-%m-%d") 47 | filtered_data = [ 48 | item 49 | for item in json_data 50 | if has_intersection(args.categories, item["categories"].split()) 51 | and datetime.strptime(item["update_date"], "%Y-%m-%d") >= from_time 52 | ] 53 | 54 | with open(args.dst_file, "w") as file: 55 | json.dump(filtered_data, file) 56 | 57 | print(f"Save to {args.dst_file}\n{len(filtered_data)} items") 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /xtuner/tools/data_preprocess/convert_refcoco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
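# Usage sketch (the paths are the argparse defaults below and are assumptions about
# the local data layout):
#
#     python xtuner/tools/data_preprocess/convert_refcoco.py \
#         --ann-path data/refcoco_annotations \
#         --image-path data/llava_data/llava_images/coco/train2017 \
#         --save-path ./
#
# The refcoco, refcoco+ and refcocog splits are merged and written to
# <save-path>/train.json.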
2 | import argparse 3 | import json 4 | 5 | from xtuner.dataset.refcoco_json import RefCOCOJsonDataset 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--ann-path", 12 | default="data/refcoco_annotations", 13 | help="Refcoco annotation path", 14 | ) 15 | parser.add_argument( 16 | "--image-path", 17 | default="data/llava_data/llava_images/coco/train2017", 18 | help="COCO image path", 19 | ) 20 | parser.add_argument( 21 | "--save-path", default="./", help="The folder to save converted data" 22 | ) 23 | args = parser.parse_args() 24 | return args 25 | 26 | 27 | if __name__ == "__main__": 28 | args = parse_args() 29 | 30 | data_info = [ 31 | ("refcoco", "unc"), 32 | ("refcoco+", "unc"), 33 | ("refcocog", "umd"), 34 | ] 35 | all_data = [] 36 | for dataset, split in data_info: 37 | data = RefCOCOJsonDataset.get_data_json( 38 | ann_path=args.ann_path, 39 | image_path=args.image_path, 40 | dataset=dataset, 41 | splitBy=split, 42 | )[0] 43 | all_data.extend(data) 44 | save_path = args.save_path + "/train.json" 45 | with open(save_path, "w") as f: 46 | print(f"save to {save_path} with {len(all_data)} items.") 47 | print(all_data[0]) 48 | json.dump(all_data, f, indent=4) 49 | -------------------------------------------------------------------------------- /xtuner/tools/get_data_order.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--data-folder", help="Data folder") 9 | parser.add_argument("--save-folder", help="The folder to save data order.") 10 | parser.add_argument( 11 | "--file-type", 12 | default=".bin", 13 | help="We want to get the order of the file in this type.", 14 | ) 15 | args = parser.parse_args() 16 | return args 17 | 18 | 19 | def save_data_order(data_folder, save_folder, file_type=".bin"): 20 | assert os.path.exists(data_folder), f"{data_folder} does not exist." 21 | triples = list(os.walk(data_folder, followlinks=True)) 22 | data_order = [] 23 | for root, dirs, files in triples: 24 | dirs.sort() 25 | print(f"Reading {root}...") 26 | for fn in sorted(files): 27 | if fn.endswith(file_type): 28 | fp = os.path.join(root, fn) 29 | # Using relative paths so that you can get the same result 30 | # on different clusters 31 | fp = fp.replace(data_folder, "")[1:] 32 | data_order.append(fp) 33 | 34 | save_path = os.path.join(save_folder, "data_order.txt") 35 | with open(save_path, "w") as f: 36 | for fp in data_order: 37 | f.write(fp + "\n") 38 | 39 | 40 | if __name__ == "__main__": 41 | args = parse_args() 42 | save_data_order(args.data_folder, args.save_folder, args.file_type) 43 | -------------------------------------------------------------------------------- /xtuner/tools/list_cfg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
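# Usage sketch (the pattern is illustrative):
#
#     python xtuner/tools/list_cfg.py -p llava
#
# Without `-p` every built-in config name is printed; with it, only names containing
# the pattern (case-insensitive) are listed.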
2 | import argparse 3 | 4 | from xtuner.configs import cfgs_name_path 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | "-p", "--pattern", default=None, help="Pattern for fuzzy matching" 11 | ) 12 | args = parser.parse_args() 13 | return args 14 | 15 | 16 | def main(pattern=None): 17 | args = parse_args() 18 | configs_names = sorted(list(cfgs_name_path.keys())) 19 | print("==========================CONFIGS===========================") 20 | if args.pattern is not None: 21 | print(f"PATTERN: {args.pattern}") 22 | print("-------------------------------") 23 | for name in configs_names: 24 | if args.pattern is None or args.pattern.lower() in name.lower(): 25 | print(name) 26 | print("=============================================================") 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /xtuner/tools/list_dataset_format.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.dataset.map_fns import DATASET_FORMAT_MAPPING 3 | 4 | 5 | def main(): 6 | dataset_format = DATASET_FORMAT_MAPPING.keys() 7 | print("======================DATASET_FORMAT======================") 8 | for format in dataset_format: 9 | print(format) 10 | print("==========================================================") 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /xtuner/tools/log_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | 4 | from mmengine.config import Config 5 | 6 | from xtuner.registry import BUILDER 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description="Log processed dataset.") 11 | parser.add_argument("config", help="config file name or path.") 12 | # chose which kind of dataset style to show 13 | parser.add_argument( 14 | "--show", 15 | default="text", 16 | choices=["text", "masked_text", "input_ids", "labels", "all"], 17 | help="which kind of dataset style to show", 18 | ) 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | cfg = Config.fromfile(args.config) 27 | 28 | tokenizer = BUILDER.build(cfg.tokenizer) 29 | if cfg.get("framework", "mmengine").lower() == "huggingface": 30 | train_dataset = BUILDER.build(cfg.train_dataset) 31 | else: 32 | train_dataset = BUILDER.build(cfg.train_dataloader.dataset) 33 | 34 | if args.show == "text" or args.show == "all": 35 | print("#" * 20 + " text " + "#" * 20) 36 | print(tokenizer.decode(train_dataset[0]["input_ids"])) 37 | if args.show == "masked_text" or args.show == "all": 38 | print("#" * 20 + " text(masked) " + "#" * 20) 39 | masked_text = " ".join( 40 | ["[-100]" for i in train_dataset[0]["labels"] if i == -100] 41 | ) 42 | unmasked_text = tokenizer.decode( 43 | [i for i in train_dataset[0]["labels"] if i != -100] 44 | ) 45 | print(masked_text + " " + unmasked_text) 46 | if args.show == "input_ids" or args.show == "all": 47 | print("#" * 20 + " input_ids " + "#" * 20) 48 | print(train_dataset[0]["input_ids"]) 49 | if args.show == "labels" or args.show == "all": 50 | print("#" * 20 + " labels " + "#" * 20) 51 | print(train_dataset[0]["labels"]) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | 
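# Usage sketch for the tool above (CONFIG is a placeholder for any training config):
#
#     python xtuner/tools/log_dataset.py CONFIG --show all
#
# `--show text` decodes the first sample's input_ids, while `--show masked_text`
# prints a "[-100]" marker for each ignored label followed by the decoded targets.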
-------------------------------------------------------------------------------- /xtuner/tools/model_converters/modeling_internlm2_reward/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /xtuner/tools/model_converters/split.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import copy 4 | import json 5 | import os 6 | import os.path as osp 7 | import shutil 8 | 9 | import torch 10 | from mmengine.utils import mkdir_or_exist 11 | 12 | from xtuner.utils.device import get_device_name, get_torch_device 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser( 17 | description="Split a HuggingFace model to the smallest sharded one" 18 | ) 19 | parser.add_argument("src_dir", help="the directory of the model") 20 | parser.add_argument("dst_dir", help="the directory to save the new model") 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | mkdir_or_exist(args.dst_dir) 28 | 29 | all_files = os.listdir(args.src_dir) 30 | for name in all_files: 31 | if not name.startswith(("pytorch_model", ".")): 32 | src_path = osp.join(args.src_dir, name) 33 | dst_path = osp.join(args.dst_dir, name) 34 | shutil.copy(src_path, dst_path) 35 | 36 | with open(osp.join(args.src_dir, "pytorch_model.bin.index.json")) as f: 37 | index = json.load(f) 38 | 39 | n_shard = len(index["weight_map"]) 40 | new_index = copy.deepcopy(index) 41 | new_index["weight_map"] = {} 42 | cnt = 1 43 | 44 | checkpoints = set(index["weight_map"].values()) 45 | for ckpt in checkpoints: 46 | state_dict = torch.load( 47 | osp.join(args.src_dir, ckpt), map_location=get_device_name() 48 | ) 49 | keys = sorted(list(state_dict.keys())) 50 | for k in keys: 51 | new_state_dict_name = f"pytorch_model-{cnt:05d}-of-{n_shard:05d}.bin" 52 | new_index["weight_map"][k] = new_state_dict_name 53 | new_state_dict = {k: state_dict[k]} 54 | torch.save(new_state_dict, osp.join(args.dst_dir, new_state_dict_name)) 55 | cnt += 1 56 | del state_dict 57 | get_torch_device().empty_cache() 58 | with open(osp.join(args.dst_dir, "pytorch_model.bin.index.json"), "w") as f: 59 | json.dump(new_index, f) 60 | assert ( 61 | new_index["weight_map"].keys() == index["weight_map"].keys() 62 | ), "Mismatch on `weight_map`!" 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /xtuner/tools/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .api import plugins_api 3 | 4 | __all__ = ["plugins_api"] 5 | -------------------------------------------------------------------------------- /xtuner/tools/plugins/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
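# Usage sketch (the plugin call string is illustrative; the exact whitespace of the
# result depends on the branches below):
#
#     plugins_api('Calculate("1+1")')
#
# should return roughly '<|Results|>:\nCalculate("1+1") => 2.00\n', i.e. every
# matched plugin call is executed and its result is appended under a
# "<|Results|>:" header.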
2 | import re 3 | 4 | 5 | def plugins_api(input_str, calculate_open=True, solve_open=True, search_open=True): 6 | pattern = r'(Solve|solve|Solver|solver|Calculate|calculate|Calculator|calculator|Search)\("([^"]*)"\)' # noqa: E501 7 | 8 | matches = re.findall(pattern, input_str) 9 | 10 | converted_str = "<|Results|>:\n" 11 | 12 | for i in range(len(matches)): 13 | if matches[i][0] in ["Calculate", "calculate", "Calculator", "calculator"]: 14 | if calculate_open: 15 | from .calculate import Calculate 16 | 17 | result = Calculate(matches[i][1]) 18 | else: 19 | result = None 20 | converted_str += f'Calculate("{matches[i][1]}") => {result}\n' 21 | elif matches[i][0] in ["Solve", "solve", "Solver", "solver"]: 22 | if solve_open: 23 | from .solve import Solve 24 | 25 | result = Solve(matches[i][1]) 26 | else: 27 | result = None 28 | converted_str += f'Solve("{matches[i][1]}") =>\n{result}\n' 29 | elif matches[i][0] == "Search": 30 | if search_open: 31 | from .search import Search 32 | 33 | result = Search(matches[i][1]) 34 | else: 35 | result = None 36 | converted_str += f'Search("{matches[i][1]}") =>\n{result}' 37 | 38 | converted_str += "\n" 39 | return converted_str 40 | -------------------------------------------------------------------------------- /xtuner/tools/plugins/calculate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from math import * # noqa: F401, F403 3 | 4 | 5 | def Calculate(expression): 6 | res = "" 7 | for exp in expression.split(";"): 8 | try: 9 | res += "{:.2f};".format(eval(exp.replace("^", "**"))) 10 | except Exception: 11 | res += "No result." 12 | if res[-1] == ";": 13 | res = res[:-1] 14 | return res 15 | -------------------------------------------------------------------------------- /xtuner/tools/plugins/search.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | import sys 4 | 5 | import requests 6 | 7 | try: 8 | SERPER_API_KEY = os.environ["SERPER_API_KEY"] 9 | except Exception: 10 | print( 11 | "Please obtain the `SERPER_API_KEY` from https://serper.dev and " 12 | "set it using `export SERPER_API_KEY=xxx`."
13 | ) 14 | sys.exit(1) 15 | 16 | 17 | def parse_results(results, k=10): 18 | snippets = [] 19 | 20 | for result in results["organic"][:k]: 21 | if "snippet" in result: 22 | snippets.append(result["snippet"]) 23 | for attribute, value in result.get("attributes", {}).items(): 24 | snippets.append(f"{attribute}: {value}.") 25 | return snippets 26 | 27 | 28 | def search(api_key, search_term, **kwargs): 29 | headers = { 30 | "X-API-KEY": api_key, 31 | "Content-Type": "application/json", 32 | } 33 | params = { 34 | "q": search_term, 35 | **{key: value for key, value in kwargs.items() if value is not None}, 36 | } 37 | try: 38 | response = requests.post( 39 | "https://google.serper.dev/search", 40 | headers=headers, 41 | params=params, 42 | timeout=5, 43 | ) 44 | except Exception as e: 45 | return -1, str(e) 46 | return response.status_code, response.json() 47 | 48 | 49 | def Search(q, k=10): 50 | status_code, response = search(SERPER_API_KEY, q) 51 | if status_code != 200: 52 | ret = "None\n" 53 | else: 54 | text = parse_results(response, k=k) 55 | ret = "" 56 | for idx, res in enumerate(text): 57 | ret += f"<|{idx+1}|>: '{res}'\n" 58 | return ret 59 | -------------------------------------------------------------------------------- /xtuner/tools/plugins/solve.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import math 3 | import re 4 | from math import * # noqa: F401, F403 5 | 6 | from sympy import Eq, solve, symbols 7 | 8 | from .calculate import Calculate 9 | 10 | 11 | def Solve(equations_str): 12 | try: 13 | equations_str = equations_str.replace(" ", "") 14 | equations_ori = re.split(r"[,;]+", equations_str) 15 | equations_str = equations_str.replace("^", "**") 16 | equations_str = re.sub(r"(\(.*\))([a-zA-Z])", r"\1 * \2", equations_str) 17 | equations_str = re.sub(r"(\d+)([a-zA-Z])", r"\1 * \2", equations_str) 18 | equations_str = equations_str.replace("pi", str(math.pi)) 19 | equations = re.split(r"[,;]+", equations_str) 20 | vars_list = list(set(re.findall(r"[a-zA-Z]+", equations_str))) 21 | vars = {var: symbols(var) for var in vars_list} 22 | 23 | output = "" 24 | eqs = [] 25 | for eq in equations: 26 | if "=" in eq: 27 | left, right = eq.split("=") 28 | eqs.append( 29 | Eq(eval(left.strip(), {}, vars), eval(right.strip(), {}, vars)) 30 | ) 31 | solutions = solve(eqs, vars, dict=True) 32 | 33 | vars_values = {var: [] for var in vars_list} 34 | if isinstance(solutions, list): 35 | for idx, solution in enumerate(solutions): 36 | for var, sol in solution.items(): 37 | output += f"{var}_{idx} = {sol}\n" 38 | vars_values[str(var)].append(sol) 39 | else: 40 | for var, sol in solutions.items(): 41 | output += f"{var} = {sol}\n" 42 | vars_values[str(var)].append(sol) 43 | for eq, eq_o in zip(equations, equations_ori): 44 | if "=" not in eq: 45 | for var in vars_list: 46 | need_note = True if len(vars_values[var]) > 1 else False 47 | for idx, value in enumerate(vars_values[var]): 48 | eq_to_calc = eq.replace(var, str(value)) 49 | calc_result = Calculate(eq_to_calc) 50 | if need_note: 51 | eq_name = eq_o.replace(var, f"{var}_{idx}") 52 | else: 53 | eq_name = eq_o 54 | if calc_result != "No result.": 55 | output += f"{eq_name} = {calc_result}\n" 56 | 57 | return output.strip() 58 | except Exception: 59 | return "No result."
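# Usage sketch for Solve above (the input string is illustrative):
#
#     Solve("2x+3=7; x^2")
#
# is intended to solve the equation part with sympy (x = 2) and then pass the
# remaining expression to Calculate with the solution substituted, yielding
# something like "x_0 = 2\nx^2 = 4.00". Any failure falls back to "No result.".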
60 | -------------------------------------------------------------------------------- /xtuner/tools/process_untokenized_llava_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import warnings 4 | 5 | from mmengine import Config 6 | 7 | from xtuner.registry import BUILDER 8 | 9 | # ignore FutureWarning in hf datasets 10 | warnings.simplefilter(action="ignore", category=FutureWarning) 11 | 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("config", help="config file name or path.") 16 | parser.add_argument("--save-folder", help="The folder to save data order.") 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def build_llava_dataset(config): 22 | dataset = BUILDER.build(config.train_dataloader.dataset) 23 | return dataset 24 | 25 | 26 | if __name__ == "__main__": 27 | args = parse_args() 28 | cfg = Config.fromfile(args.config) 29 | 30 | llava_dataset = build_llava_dataset(cfg) 31 | text_data = llava_dataset.text_data 32 | 33 | text_data.save_to_disk(args.save_folder) 34 | -------------------------------------------------------------------------------- /xtuner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .constants import ( 3 | DEFAULT_IMAGE_TOKEN, 4 | DEFAULT_PAD_TOKEN_INDEX, 5 | IGNORE_INDEX, 6 | IMAGE_TOKEN_INDEX, 7 | ) 8 | from .handle_moe_load_and_save import ( 9 | SUPPORT_MODELS, 10 | get_origin_state_dict, 11 | load_state_dict_into_model, 12 | ) 13 | from .stop_criteria import StopWordStoppingCriteria 14 | from .templates import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 15 | 16 | __all__ = [ 17 | "IGNORE_INDEX", 18 | "DEFAULT_PAD_TOKEN_INDEX", 19 | "PROMPT_TEMPLATE", 20 | "DEFAULT_IMAGE_TOKEN", 21 | "SYSTEM_TEMPLATE", 22 | "StopWordStoppingCriteria", 23 | "IMAGE_TOKEN_INDEX", 24 | "load_state_dict_into_model", 25 | "get_origin_state_dict", 26 | "SUPPORT_MODELS", 27 | ] 28 | -------------------------------------------------------------------------------- /xtuner/utils/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | IGNORE_INDEX = -100 3 | DEFAULT_PAD_TOKEN_INDEX = 0 4 | IMAGE_TOKEN_INDEX = -200 5 | DEFAULT_IMAGE_TOKEN = "" 6 | -------------------------------------------------------------------------------- /xtuner/utils/device.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # This code is inspired by the torchtune. 3 | # https://github.com/pytorch/torchtune/blob/main/torchtune/utils/_device.py 4 | 5 | import logging 6 | from typing import Optional 7 | 8 | import torch 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def is_torch_npu_available() -> bool: 14 | """Check the availability of NPU.""" 15 | try: 16 | import torch_npu # noqa: F401 17 | 18 | return torch.npu.is_available() 19 | except ImportError: 20 | return False 21 | 22 | 23 | is_cuda_available = torch.cuda.is_available() 24 | is_npu_available = is_torch_npu_available() 25 | 26 | 27 | def get_device_name() -> str: 28 | """Function that gets the torch.device based on the current machine. 29 | 30 | This currently only supports CPU, CUDA, NPU. 
31 | 32 | Returns: 33 | device 34 | """ 35 | if is_cuda_available: 36 | device = "cuda" 37 | elif is_npu_available: 38 | device = "npu" 39 | else: 40 | device = "cpu" 41 | return device 42 | 43 | 44 | def get_device(device_name: Optional[str] = None) -> torch.device: 45 | """Function that takes an optional device string, verifies it's correct and 46 | available given the machine and distributed settings, and returns a 47 | :func:`~torch.device`. If device string is not provided, this function will 48 | infer the device based on the environment. 49 | 50 | If CUDA-like is available and being used, this function also sets the CUDA-like device. 51 | 52 | Args: 53 | device (Optional[str]): The name of the device to use, e.g. "cuda" or "cpu" or "npu". 54 | 55 | Example: 56 | >>> device = get_device("cuda") 57 | >>> device 58 | device(type='cuda', index=0) 59 | 60 | Returns: 61 | torch.device: Device 62 | """ 63 | if device_name is None: 64 | device_name = get_device_name() 65 | device = torch.device(device_name) 66 | return device 67 | 68 | 69 | def get_torch_device() -> any: 70 | """Return the corresponding torch attribute based on the device type 71 | string. 72 | 73 | Returns: 74 | module: The corresponding torch device namespace, or torch.cuda if not found. 75 | """ 76 | device_name = get_device_name() 77 | try: 78 | return getattr(torch, device_name) 79 | except AttributeError: 80 | logger.warning( 81 | f"Device namespace '{device_name}' not found in torch, try to load torch.cuda." 82 | ) 83 | return torch.cuda 84 | -------------------------------------------------------------------------------- /xtuner/utils/stop_criteria.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import StoppingCriteria 3 | 4 | 5 | class StopWordStoppingCriteria(StoppingCriteria): 6 | """StopWord stopping criteria.""" 7 | 8 | def __init__(self, tokenizer, stop_word): 9 | self.tokenizer = tokenizer 10 | self.stop_word = stop_word 11 | self.length = len(self.stop_word) 12 | 13 | def __call__(self, input_ids, *args, **kwargs) -> bool: 14 | cur_text = self.tokenizer.decode(input_ids[0]) 15 | cur_text = cur_text.replace("\r", "").replace("\n", "") 16 | return cur_text[-self.length :] == self.stop_word 17 | -------------------------------------------------------------------------------- /xtuner/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | __version__ = "0.2.0rc0" 3 | short_version = __version__ 4 | 5 | 6 | def parse_version_info(version_str): 7 | """Parse a version string into a tuple. 8 | 9 | Args: 10 | version_str (str): The version string. 11 | Returns: 12 | tuple[int or str]: The version info, e.g., "1.3.0" is parsed into 13 | (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). 14 | """ 15 | version_info = [] 16 | for x in version_str.split("."): 17 | if x.isdigit(): 18 | version_info.append(int(x)) 19 | elif x.find("rc") != -1: 20 | patch_version = x.split("rc") 21 | version_info.append(int(patch_version[0])) 22 | version_info.append(f"rc{patch_version[1]}") 23 | return tuple(version_info) 24 | 25 | 26 | version_info = parse_version_info(__version__) 27 | --------------------------------------------------------------------------------
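# Worked example for parse_version_info above, consistent with its docstring:
#
#     parse_version_info("0.2.0rc0")  # -> (0, 2, 0, 'rc0')
#
# so `version_info` for this release evaluates to (0, 2, 0, 'rc0').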