├── README.md
├── eval
│   ├── data
│   │   ├── aime
│   │   │   └── test.jsonl
│   │   ├── amc
│   │   │   └── test.jsonl
│   │   ├── cn_math_2024
│   │   │   └── test.jsonl
│   │   ├── gaokao
│   │   │   └── test.jsonl
│   │   ├── gpqa
│   │   │   └── test.jsonl
│   │   ├── grade_school_math
│   │   │   └── test.jsonl
│   │   ├── kaoyan
│   │   │   └── test.jsonl
│   │   ├── math
│   │   │   └── test.jsonl
│   │   ├── minerva
│   │   │   └── test.jsonl
│   │   └── olympiadbench
│   │       └── test.jsonl
│   ├── eval.py
│   ├── eval.sh
│   ├── prompt.txt
│   ├── prompts
│   │   └── qwen-instruct
│   │       ├── aime.py
│   │       ├── amc.py
│   │       ├── gpqa.py
│   │       ├── math.py
│   │       ├── minerva.py
│   │       └── olympiadbench.py
│   ├── readme.md
│   ├── requirements.txt
│   └── utils
│       ├── __pycache__
│       │   ├── grader.cpython-310.pyc
│       │   ├── math_normalization.cpython-310.pyc
│       │   └── parser.cpython-310.pyc
│       ├── data_loader.py
│       ├── examples.py
│       ├── grader.py
│       ├── math_normalization.py
│       ├── parser.py
│       └── utils.py
├── images
│   └── limo.png
└── train
    ├── CITATION.cff
    ├── LICENSE
    ├── MANIFEST.in
    ├── Makefile
    ├── README.md
    ├── README_zh.md
    ├── assets
    │   ├── benchmark.svg
    │   ├── logo.png
    │   ├── wechat.jpg
    │   └── wechat_npu.jpg
    ├── data
    │   ├── README.md
    │   ├── README_zh.md
    │   ├── alpaca_en_demo.json
    │   ├── alpaca_zh_demo.json
    │   ├── belle_multiturn
    │   │   └── belle_multiturn.py
    │   ├── c4_demo.json
    │   ├── dataset_info.json
    │   ├── dpo_en_demo.json
    │   ├── dpo_zh_demo.json
    │   ├── glaive_toolcall_en_demo.json
    │   ├── glaive_toolcall_zh_demo.json
    │   ├── hh_rlhf_en
    │   │   └── hh_rlhf_en.py
    │   ├── identity.json
    │   ├── kto_en_demo.json
    │   ├── limo.json
    │   ├── mllm_audio_demo.json
    │   ├── mllm_demo.json
    │   ├── mllm_demo_data
    │   │   ├── 1.jpg
    │   │   ├── 1.mp3
    │   │   ├── 1.mp4
    │   │   ├── 2.avi
    │   │   ├── 2.jpg
    │   │   ├── 2.wav
    │   │   ├── 3.flac
    │   │   ├── 3.jpg
    │   │   └── 3.mp4
    │   ├── mllm_video_demo.json
    │   ├── ultra_chat
    │   │   └── ultra_chat.py
    │   └── wiki_demo.txt
    ├── docker
    │   ├── docker-cuda
    │   │   ├── Dockerfile
    │   │   └── docker-compose.yml
    │   ├── docker-npu
    │   │   ├── Dockerfile
    │   │   └── docker-compose.yml
    │   └── docker-rocm
    │       ├── Dockerfile
    │       └── docker-compose.yml
    ├── evaluation
    │   ├── ceval
    │   │   ├── ceval.py
    │   │   ├── ceval.zip
    │   │   └── mapping.json
    │   ├── cmmlu
    │   │   ├── cmmlu.py
    │   │   ├── cmmlu.zip
    │   │   └── mapping.json
    │   └── mmlu
    │       ├── mapping.json
    │       ├── mmlu.py
    │       └── mmlu.zip
    ├── examples
    │   ├── README.md
    │   ├── README_zh.md
    │   ├── accelerate
    │   │   └── fsdp_config.yaml
    │   ├── deepspeed
    │   │   ├── ds_z0_config.json
    │   │   ├── ds_z2_config.json
    │   │   ├── ds_z2_offload_config.json
    │   │   ├── ds_z3_config.json
    │   │   └── ds_z3_offload_config.json
    │   ├── extras
    │   │   ├── adam_mini
    │   │   │   └── qwen2_full_sft.yaml
    │   │   ├── apollo
    │   │   │   └── llama3_full_sft.yaml
    │   │   ├── badam
    │   │   │   └── llama3_full_sft.yaml
    │   │   ├── fsdp_qlora
    │   │   │   ├── llama3_lora_sft.yaml
    │   │   │   └── train.sh
    │   │   ├── galore
    │   │   │   └── llama3_full_sft.yaml
    │   │   ├── llama_pro
    │   │   │   ├── expand.sh
    │   │   │   └── llama3_freeze_sft.yaml
    │   │   ├── loraplus
    │   │   │   └── llama3_lora_sft.yaml
    │   │   ├── mod
    │   │   │   └── llama3_full_sft.yaml
    │   │   ├── nlg_eval
    │   │   │   └── llama3_lora_predict.yaml
    │   │   └── pissa
    │   │       ├── init.sh
    │   │       └── llama3_lora_sft.yaml
    │   ├── inference
    │   │   ├── llama3.yaml
    │   │   ├── llama3_full_sft.yaml
    │   │   ├── llama3_lora_sft.yaml
    │   │   ├── llama3_vllm.yaml
    │   │   ├── llava1_5.yaml
    │   │   └── qwen2_vl.yaml
    │   ├── merge_lora
    │   │   ├── llama3_gptq.yaml
    │   │   ├── llama3_lora_sft.yaml
    │   │   └── qwen2vl_lora_sft.yaml
    │   ├── train_full
    │   │   ├── llama3_full_sft.yaml
    │   │   └── qwen2vl_full_sft.yaml
    │   ├── train_limo.yaml
    │   ├── train_lora
    │   │   ├── llama3_lora_dpo.yaml
    │   │   ├── llama3_lora_eval.yaml
    │   │   ├── llama3_lora_kto.yaml
    │   │   ├── llama3_lora_ppo.yaml
    │   │   ├── llama3_lora_pretrain.yaml
    │   │   ├── llama3_lora_reward.yaml
    │   │   ├── llama3_lora_sft.yaml
    │   │   ├── llama3_lora_sft_ds3.yaml
    │   │   ├── llama3_lora_sft_ray.yaml
    │   │   ├── llama3_preprocess.yaml
    │   │   ├── llava1_5_lora_sft.yaml
    │   │   ├── qwen2vl_lora_dpo.yaml
    │   │   └── qwen2vl_lora_sft.yaml
    │   └── train_qlora
    │       ├── llama3_lora_sft_aqlm.yaml
    │       ├── llama3_lora_sft_awq.yaml
    │       ├── llama3_lora_sft_bnb_npu.yaml
    │       ├── llama3_lora_sft_gptq.yaml
    │       └── llama3_lora_sft_otfq.yaml
    ├── pyproject.toml
    ├── requirements.txt
    ├── scripts
    │   ├── api_example
    │   │   ├── test_image.py
    │   │   └── test_toolcall.py
    │   ├── convert_ckpt
    │   │   ├── llamafy_baichuan2.py
    │   │   └── llamafy_qwen.py
    │   ├── llama_pro.py
    │   ├── loftq_init.py
    │   ├── pissa_init.py
    │   ├── stat_utils
    │   │   ├── cal_flops.py
    │   │   ├── cal_lr.py
    │   │   ├── cal_mfu.py
    │   │   ├── cal_ppl.py
    │   │   └── length_cdf.py
    │   └── vllm_infer.py
    ├── setup.py
    ├── src
    │   ├── api.py
    │   ├── llamafactory
    │   │   ├── __init__.py
    │   │   ├── api
    │   │   │   ├── __init__.py
    │   │   │   ├── app.py
    │   │   │   ├── chat.py
    │   │   │   ├── common.py
    │   │   │   └── protocol.py
    │   │   ├── chat
    │   │   │   ├── __init__.py
    │   │   │   ├── base_engine.py
    │   │   │   ├── chat_model.py
    │   │   │   ├── hf_engine.py
    │   │   │   └── vllm_engine.py
    │   │   ├── cli.py
    │   │   ├── data
    │   │   │   ├── __init__.py
    │   │   │   ├── aligner.py
    │   │   │   ├── collator.py
    │   │   │   ├── data_utils.py
    │   │   │   ├── formatter.py
    │   │   │   ├── loader.py
    │   │   │   ├── mm_plugin.py
    │   │   │   ├── parser.py
    │   │   │   ├── preprocess.py
    │   │   │   ├── processors
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── feedback.py
    │   │   │   │   ├── pairwise.py
    │   │   │   │   ├── pretrain.py
    │   │   │   │   ├── processor_utils.py
    │   │   │   │   ├── supervised.py
    │   │   │   │   └── unsupervised.py
    │   │   │   ├── template.py
    │   │   │   └── tool_utils.py
    │   │   ├── eval
    │   │   │   ├── __init__.py
    │   │   │   ├── evaluator.py
    │   │   │   └── template.py
    │   │   ├── extras
    │   │   │   ├── __init__.py
    │   │   │   ├── constants.py
    │   │   │   ├── env.py
    │   │   │   ├── logging.py
    │   │   │   ├── misc.py
    │   │   │   ├── packages.py
    │   │   │   └── ploting.py
    │   │   ├── hparams
    │   │   │   ├── __init__.py
    │   │   │   ├── data_args.py
    │   │   │   ├── evaluation_args.py
    │   │   │   ├── finetuning_args.py
    │   │   │   ├── generating_args.py
    │   │   │   ├── model_args.py
    │   │   │   ├── parser.py
    │   │   │   └── training_args.py
    │   │   ├── launcher.py
    │   │   ├── model
    │   │   │   ├── __init__.py
    │   │   │   ├── adapter.py
    │   │   │   ├── loader.py
    │   │   │   ├── model_utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── attention.py
    │   │   │   │   ├── checkpointing.py
    │   │   │   │   ├── embedding.py
    │   │   │   │   ├── liger_kernel.py
    │   │   │   │   ├── longlora.py
    │   │   │   │   ├── misc.py
    │   │   │   │   ├── mod.py
    │   │   │   │   ├── moe.py
    │   │   │   │   ├── packing.py
    │   │   │   │   ├── quantization.py
    │   │   │   │   ├── rope.py
    │   │   │   │   ├── unsloth.py
    │   │   │   │   ├── valuehead.py
    │   │   │   │   └── visual.py
    │   │   │   └── patcher.py
    │   │   ├── train
    │   │   │   ├── __init__.py
    │   │   │   ├── callbacks.py
    │   │   │   ├── dpo
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── kto
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── ppo
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── ppo_utils.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── pt
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── rm
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── metric.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── sft
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── metric.py
    │   │   │   │   ├── trainer.py
    │   │   │   │   └── workflow.py
    │   │   │   ├── test_utils.py
    │   │   │   ├── trainer_utils.py
    │   │   │   └── tuner.py
    │   │   └── webui
    │   │       ├── __init__.py
    │   │       ├── chatter.py
    │   │       ├── common.py
    │   │       ├── components
    │   │       │   ├── __init__.py
    │   │       │   ├── chatbot.py
    │   │       │   ├── data.py
    │   │       │   ├── eval.py
    │   │       │   ├── export.py
    │   │       │   ├── infer.py
    │   │       │   ├── top.py
    │   │       │   └── train.py
    │   │       ├── control.py
    │   │       ├── css.py
    │   │       ├── engine.py
    │   │       ├── interface.py
    │   │       ├── locales.py
    │   │       ├── manager.py
    │   │       └── runner.py
    │   ├── train.py
    │   └── webui.py
    └── tests
        ├── data
        │   ├── processors
        │   │   ├── test_feedback.py
        │   │   ├── test_pairwise.py
        │   │   ├── test_processor_utils.py
        │   │   ├── test_supervised.py
        │   │   └── test_unsupervised.py
        │   ├── test_collator.py
        │   ├── test_formatter.py
        │   ├── test_mm_plugin.py
        │   └── test_template.py
        ├── e2e
        │   ├── test_chat.py
        │   └── test_train.py
        ├── eval
        │   └── test_eval_template.py
        ├── model
        │   ├── model_utils
        │   │   ├── test_attention.py
        │   │   ├── test_checkpointing.py
        │   │   ├── test_misc.py
        │   │   ├── test_packing.py
        │   │   └── test_visual.py
        │   ├── test_base.py
        │   ├── test_freeze.py
        │   ├── test_full.py
        │   ├── test_lora.py
        │   └── test_pissa.py
        └── train
            └── test_sft_trainer.py

/eval/eval.sh:
--------------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES='0,1,2,3' \
python eval.py \
    --model_name_or_path "Qwen/Qwen2.5-32B-Instruct" \
    --data_name "math" \
    --prompt_type "qwen-instruct" \
    --temperature 0.0 \
    --start_idx 0 \
    --end_idx -1 \
    --n_sampling 1 \
    --k 1 \
    --split "test" \
    --max_tokens 32768 \
    --seed 0 \
    --top_p 1 \
    --surround_with_messages

/eval/prompt.txt:
--------------------------------------------------------------------------------
## System Prompt

You are an experienced examiner who evaluates whether a student's answer to a given question is correct.
Your task is to determine if the student's final answer matches the standard answer provided, based solely on correctness and the question's specific requirements.
Do not perform any additional calculations or reinterpret the question. Simply compare the student's answer to the standard answer to determine if it satisfies the question's requirements.

Focus strictly on:
1. Understanding the exact requirement of the question.
2. Comparing the student's final answer directly to the provided standard answer.
3. Your task is not to solve the problem but to determine whether the student's answer is correct based on the question's requirements. Avoid any unnecessary analysis, assumptions, or re-solving the problem.

Note:
- For intervals/ranges: The student's answer must cover the EXACT SAME range as the standard answer, NOT just any single value or subset within that range;
- If the standard answer contains multiple solutions connected by "or"/"and", all of them must be listed in the student's answer;
- You must be deterministic - always declare the answer as either CORRECT or WRONG;

Your response must include:
## Analysis


## Correctness



## User Prompt

Question: {problem}

Standard Answer: {standard_answer}

Student's Final Answer: {model pred's final answer}

/eval/prompts/qwen-instruct/aime.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""

/eval/prompts/qwen-instruct/amc.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""

/eval/prompts/qwen-instruct/gpqa.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""
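
All six prompt modules under prompts/qwen-instruct expose the same three attributes: system_prompt, few_shot_prompt, and question_format. The sketch below shows one way such a module could be loaded by file path and assembled into chat messages. It is only an illustration, not the repository's actual eval.py (which is not reproduced in this dump); load_prompt_module and build_messages are invented names.

# Minimal sketch: load a per-benchmark prompt module and build chat messages.
# Assumption: run from the eval/ directory, where prompts/<type>/<name>.py exists.
import importlib.util
from pathlib import Path


def load_prompt_module(prompt_type: str, data_name: str):
    # Load by file path, because directory names like "qwen-instruct"
    # are not valid Python identifiers for a dotted import.
    path = Path("prompts") / prompt_type / f"{data_name}.py"
    spec = importlib.util.spec_from_file_location(f"prompt_{data_name}", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def build_messages(prompt_type: str, data_name: str, question: str):
    module = load_prompt_module(prompt_type, data_name)
    user_content = module.few_shot_prompt + module.question_format.format(question=question)
    return [
        {"role": "system", "content": module.system_prompt},
        {"role": "user", "content": user_content},
    ]


if __name__ == "__main__":
    for message in build_messages("qwen-instruct", "math", "Compute $1 + 1$."):
        print(message["role"], "->", message["content"])
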
/eval/prompts/qwen-instruct/math.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""

/eval/prompts/qwen-instruct/minerva.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""

/eval/prompts/qwen-instruct/olympiadbench.py:
--------------------------------------------------------------------------------
system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."

few_shot_prompt = ""

question_format = """{question}"""

/eval/requirements.txt:
--------------------------------------------------------------------------------
# common
vllm<=0.6.1
tqdm
datasets
torch
transformers
python_dateutil
flash_attn

# math_eval
sympy==1.12
antlr4-python3-runtime==4.11.1  # ! The version needs to be compatible with sympy.
word2number
Pebble
timeout-decorator
latex2sympy2==1.9.1

/eval/utils/__pycache__/grader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/eval/utils/__pycache__/grader.cpython-310.pyc

/eval/utils/__pycache__/math_normalization.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/eval/utils/__pycache__/math_normalization.cpython-310.pyc

/eval/utils/__pycache__/parser.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/eval/utils/__pycache__/parser.cpython-310.pyc

/eval/utils/data_loader.py:
--------------------------------------------------------------------------------
import os
import json
import random
from datasets import load_dataset, Dataset, concatenate_datasets
from utils.utils import load_jsonl, lower_keys

def load_data(data_name, split, data_dir='./data'):
    data_file = f"{data_dir}/{data_name}/{split}.jsonl"
    if os.path.exists(data_file):
        examples = list(load_jsonl(data_file))
    else:
        if data_name == "math":
            dataset = load_dataset("competition_math", split=split, name="main", cache_dir=f"{data_dir}/temp")
        elif data_name == "theorem-qa":
            dataset = load_dataset("wenhu/TheoremQA", split=split)
        elif data_name == "gsm8k":
            dataset = load_dataset(data_name, split=split)
        elif data_name == "gsm-hard":
            dataset = load_dataset("reasoning-machines/gsm-hard", split="train")
        elif data_name == "svamp":
            # evaluate on training set + test set
            dataset = load_dataset("ChilleD/SVAMP", split="train")
            dataset = concatenate_datasets([dataset, load_dataset("ChilleD/SVAMP", split="test")])
        elif data_name == "asdiv":
            dataset = load_dataset("EleutherAI/asdiv", split="validation")
            dataset = dataset.filter(lambda x: ";" not in x['answer'])  # remove multi-answer examples
        elif data_name == "mawps":
            examples = []
            # four sub-tasks; a separate loop variable keeps the outer
            # `data_name` (used for the cache path below) from being clobbered
            for sub_name in ["singleeq", "singleop", "addsub", "multiarith"]:
                sub_examples = list(load_jsonl(f"{data_dir}/mawps/{sub_name}.jsonl"))
                for example in sub_examples:
                    example['type'] = sub_name
                examples.extend(sub_examples)
            dataset = Dataset.from_list(examples)
        elif data_name == "finqa":
            dataset = load_dataset("dreamerdeo/finqa", split=split, name="main")
            dataset = dataset.select(random.sample(range(len(dataset)), 1000))
        elif data_name == "tabmwp":
            examples = []
            with open(f"{data_dir}/tabmwp/tabmwp_{split}.json", "r") as f:
                data_dict = json.load(f)
                examples.extend(data_dict.values())
            dataset = Dataset.from_list(examples)
            dataset = dataset.select(random.sample(range(len(dataset)), 1000))
        elif data_name == "bbh":
            examples = []
            for sub_name in ["reasoning_about_colored_objects", "penguins_in_a_table",
                             "date_understanding", "repeat_copy_logic", "object_counting"]:
                with open(f"{data_dir}/bbh/bbh/{sub_name}.json", "r") as f:
                    sub_examples = json.load(f)["examples"]
                    for example in sub_examples:
                        example['type'] = sub_name
                    examples.extend(sub_examples)
            dataset = Dataset.from_list(examples)
        else:
            raise NotImplementedError(data_name)

        examples = list(dataset)
        examples = [lower_keys(example) for example in examples]
        dataset = Dataset.from_list(examples)
        os.makedirs(f"{data_dir}/{data_name}", exist_ok=True)
        dataset.to_json(data_file)

    # add 'idx' in the first column
    if 'idx' not in examples[0]:
        examples = [{'idx': i, **example} for i, example in enumerate(examples)]

    # deduplicate & sort
    examples = sorted(examples, key=lambda x: x['idx'])
    return examples
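
A quick usage sketch for load_data above. It assumes the working directory is eval/ (so the utils package is importable and ./data/math/test.jsonl from the tree above is found); the available fields depend on the benchmark file, but 'idx' is guaranteed because the function adds it when missing.

# Usage sketch for utils.data_loader.load_data (run from the eval/ directory).
from utils.data_loader import load_data

examples = load_data("math", "test")  # reads ./data/math/test.jsonl
print(f"loaded {len(examples)} examples")
print("first example keys:", sorted(examples[0].keys()))
print("first idx:", examples[0]["idx"])
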
/images/limo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/images/limo.png

/train/CITATION.cff:
--------------------------------------------------------------------------------
cff-version: 1.2.0
date-released: 2024-03
message: "If you use this software, please cite it as below."
authors:
  - family-names: "Zheng"
    given-names: "Yaowei"
  - family-names: "Zhang"
    given-names: "Richong"
  - family-names: "Zhang"
    given-names: "Junhao"
  - family-names: "Ye"
    given-names: "Yanhan"
  - family-names: "Luo"
    given-names: "Zheyan"
  - family-names: "Feng"
    given-names: "Zhangchi"
  - family-names: "Ma"
    given-names: "Yongqiang"
title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
url: "https://arxiv.org/abs/2403.13372"
preferred-citation:
  type: conference-paper
  conference:
    name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)"
  authors:
    - family-names: "Zheng"
      given-names: "Yaowei"
    - family-names: "Zhang"
      given-names: "Richong"
    - family-names: "Zhang"
      given-names: "Junhao"
    - family-names: "Ye"
      given-names: "Yanhan"
    - family-names: "Luo"
      given-names: "Zheyan"
    - family-names: "Feng"
      given-names: "Zhangchi"
    - family-names: "Ma"
      given-names: "Yongqiang"
  title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
  url: "https://arxiv.org/abs/2403.13372"
  year: 2024
  publisher: "Association for Computational Linguistics"
  address: "Bangkok, Thailand"
/train/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE requirements.txt

/train/Makefile:
--------------------------------------------------------------------------------
.PHONY: build commit quality style test

check_dirs := scripts src tests setup.py

build:
	pip install build && python -m build

commit:
	pre-commit install
	pre-commit run --all-files

quality:
	ruff check $(check_dirs)
	ruff format --check $(check_dirs)

style:
	ruff check $(check_dirs) --fix
	ruff format $(check_dirs)

test:
	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/

/train/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/train/assets/logo.png

/train/assets/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/train/assets/wechat.jpg

/train/assets/wechat_npu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/LIMO/e9951354af219d8c4d454e25ae348395e6598ab0/train/assets/wechat_npu.jpg

/train/data/belle_multiturn/belle_multiturn.py:
--------------------------------------------------------------------------------
import json
import os

import datasets


_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")

_DESCRIPTION = "BELLE multiturn chat dataset."

_CITATION = """\
@article{belle2023exploring,
  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
  journal={arXiv preprint arXiv:2303.14742},
  year={2023}
}
"""

_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
_LICENSE = "gpl-3.0"
_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"


class BelleMultiturn(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.0")

    def _info(self):
        features = datasets.Features(
            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        file_path = dl_manager.download(_URL)
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]

    def _generate_examples(self, filepath: str):
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                conversations = []
                prompt = data["instruction"].strip()
                response = data["output"].strip()

                assist_idx = prompt.rfind("Assistant:")
                human_idx = prompt.rfind("Human:")
                query = prompt[human_idx + 6 : assist_idx].strip()
                prompt = prompt[:human_idx].strip()
                conversations.insert(0, {"from": "gpt", "value": response})
                conversations.insert(0, {"from": "human", "value": query})

                while prompt.rfind("Assistant:") != -1:
                    assist_idx = prompt.rfind("Assistant:")
                    human_idx = prompt.rfind("Human:")
                    if human_idx != -1:
                        old_query = prompt[human_idx + 6 : assist_idx].strip()
                        old_resp = prompt[assist_idx + 10 :].strip()
                        conversations.insert(0, {"from": "gpt", "value": old_resp})
                        conversations.insert(0, {"from": "human", "value": old_query})
                    else:
                        break
                    prompt = prompt[:human_idx].strip()

                yield key, {"conversations": conversations}
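
The rfind-based loop in _generate_examples above rebuilds a conversation by walking the concatenated history backwards. The standalone sketch below replays that logic on an invented two-turn record; it assumes, as the slicing offsets 6 == len("Human:") and 10 == len("Assistant:") imply, that each raw instruction ends with a trailing "Assistant:" marker.

# Standalone replay of the parsing in BelleMultiturn._generate_examples.
# The sample record is invented for illustration; real records come from
# multiturn_chat_0.8M.json.
sample = {
    "instruction": "Human: Hello Assistant: Hi there! Human: Tell me a joke Assistant:",
    "output": "Why did the chicken cross the road?",
}


def parse_record(data):
    conversations = []
    prompt = data["instruction"].strip()
    response = data["output"].strip()

    # Latest turn: text between the last "Human:" and the trailing "Assistant:".
    assist_idx = prompt.rfind("Assistant:")
    human_idx = prompt.rfind("Human:")
    query = prompt[human_idx + 6 : assist_idx].strip()    # 6 == len("Human:")
    prompt = prompt[:human_idx].strip()
    conversations.insert(0, {"from": "gpt", "value": response})
    conversations.insert(0, {"from": "human", "value": query})

    # Earlier turns: peel off one Human/Assistant pair per iteration.
    while prompt.rfind("Assistant:") != -1:
        assist_idx = prompt.rfind("Assistant:")
        human_idx = prompt.rfind("Human:")
        if human_idx != -1:
            old_query = prompt[human_idx + 6 : assist_idx].strip()
            old_resp = prompt[assist_idx + 10 :].strip()  # 10 == len("Assistant:")
            conversations.insert(0, {"from": "gpt", "value": old_resp})
            conversations.insert(0, {"from": "human", "value": old_query})
        else:
            break
        prompt = prompt[:human_idx].strip()
    return conversations


print(parse_record(sample))
# [{'from': 'human', 'value': 'Hello'},
#  {'from': 'gpt', 'value': 'Hi there!'},
#  {'from': 'human', 'value': 'Tell me a joke'},
#  {'from': 'gpt', 'value': 'Why did the chicken cross the road?'}]
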
dataset." 10 | 11 | _CITATION = """\ 12 | @article{belle2023exploring, 13 | title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases}, 14 | author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li}, 15 | journal={arXiv preprint arXiv:2303.14742}, 16 | year={2023} 17 | } 18 | """ 19 | 20 | _HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M" 21 | _LICENSE = "gpl-3.0" 22 | _URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json" 23 | 24 | 25 | class BelleMultiturn(datasets.GeneratorBasedBuilder): 26 | VERSION = datasets.Version("0.0.0") 27 | 28 | def _info(self): 29 | features = datasets.Features( 30 | {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]} 31 | ) 32 | return datasets.DatasetInfo( 33 | description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION 34 | ) 35 | 36 | def _split_generators(self, dl_manager: datasets.DownloadManager): 37 | file_path = dl_manager.download(_URL) 38 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})] 39 | 40 | def _generate_examples(self, filepath: str): 41 | with open(filepath, encoding="utf-8") as f: 42 | for key, row in enumerate(f): 43 | data = json.loads(row) 44 | conversations = [] 45 | prompt = data["instruction"].strip() 46 | response = data["output"].strip() 47 | 48 | assist_idx = prompt.rfind("Assistant:") 49 | human_idx = prompt.rfind("Human:") 50 | query = prompt[human_idx + 6 : assist_idx].strip() 51 | prompt = prompt[:human_idx].strip() 52 | conversations.insert(0, {"from": "gpt", "value": response}) 53 | conversations.insert(0, {"from": "human", "value": query}) 54 | 55 | while prompt.rfind("Assistant:") != -1: 56 | assist_idx = prompt.rfind("Assistant:") 57 | human_idx = prompt.rfind("Human:") 58 | if human_idx != -1: 59 | old_query = prompt[human_idx + 6 : assist_idx].strip() 60 | old_resp = prompt[assist_idx + 10 :].strip() 61 | conversations.insert(0, {"from": "gpt", "value": old_resp}) 62 | conversations.insert(0, {"from": "human", "value": old_query}) 63 | else: 64 | break 65 | prompt = prompt[:human_idx].strip() 66 | 67 | yield key, {"conversations": conversations} 68 | -------------------------------------------------------------------------------- /train/data/mllm_audio_demo.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "messages": [ 4 | { 5 | "content": "