├── data
│   └── Readme
├── r1-alpaca效果.png
├── r1-studio效果.png
├── deallocaldistil.py
├── parquet2json.py
├── jsonl2json.py
├── demo.py
├── distillationgalpacalocal.py
├── distillationgalpacaremote.py
├── generatefinetune.py
├── README.md
└── finetuneSFTR1.py
/data/Readme:
--------------------------------------------------------------------------------
Place the dataset files here.
--------------------------------------------------------------------------------
/r1-alpaca效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StarRing2022/R1-Nature/HEAD/r1-alpaca效果.png
--------------------------------------------------------------------------------
/r1-studio效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StarRing2022/R1-Nature/HEAD/r1-studio效果.png
--------------------------------------------------------------------------------
/deallocaldistil.py:
--------------------------------------------------------------------------------
import json

# Paths of the original (locally distilled) JSON file and the post-processed output file
input_file_path = "./data/alpaca_r1_data_zh-local.json"   # replace with your source file path
output_file_path = "./data/alpaca_r1_data_zh-localpost.json"  # replace with the path to save the new file

# Read the JSON file
with open(input_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Walk through the records and fix up the "output" field: the locally distilled
# DeepSeek outputs only carry <think>...</think>, so wrap the remaining answer
# part in <answer>...</answer>
for item in data:
    if "output" in item:
        # find "</think>\n\n" and insert the opening <answer> tag after it
        modified_output = item["output"].replace("</think>\n\n", "</think>\n\n<answer>")
        # append the closing </answer> tag at the very end
        modified_output += "</answer>"
        item["output"] = modified_output

# Save the modified data to a new JSON file
with open(output_file_path, "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

print("处理完成,新文件已保存到", output_file_path)
--------------------------------------------------------------------------------
/parquet2json.py:
--------------------------------------------------------------------------------
import pyarrow.parquet as pq
import json
import pandas as pd


def parquet_to_json(input_file, output_file):
    # Read the Parquet file
    df = pd.read_parquet(input_file)
    # Drop the "input" column, normalize the reasoning tags, then save as JSON
    if "input" in df.columns:
        df.drop(columns=["input"], inplace=True)
    if "output" in df.columns:
        # map the source dataset's thought tags onto the <think> tags used for finetuning
        df["output"] = df["output"].str.replace("<thought>", "<think>")
        df["output"] = df["output"].str.replace("</thought>", "</think>")

    df.to_json(output_file, orient='records', force_ascii=False, indent=4)

    print(f"数据已成功从 {input_file} 转换并保存到 {output_file}")

# Example usage
input_file = './data/magpie-reason-train-00000-of-00001.parquet'  # input Parquet file path
output_file = './data/magpie-r1.json'  # output JSON file path
parquet_to_json(input_file, output_file)
--------------------------------------------------------------------------------
/jsonl2json.py:
--------------------------------------------------------------------------------
import json

def jsonl_to_json(input_file, output_file):
    # Open the input .jsonl file
    with open(input_file, 'r', encoding='utf-8') as infile:
        # Read line by line and parse each line as a JSON object
        data = [json.loads(line) for line in infile]


    for item in data:
        item['instruction'] = item.pop('prompt')

        # convert the OpenO1-SFT <Thought>/<Output> tags to the <think>/<answer> tags used for finetuning
        item['response'] = item['response'].replace("<Thought>", "<think>")
        item['response'] = item['response'].replace("</Thought>", "</think>")
        item['response'] = item['response'].replace("<Output>", "<answer>")
        item['response'] = item['response'].replace("</Output>", "</answer>")
        #print(item['response'])
        item['output'] = item.pop('response')
    # Save the parsed data as a single JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=4)

    print(f"数据已成功从 {input_file} 转换并保存到 {output_file}")

# Example usage
input_file = './data/openo1-SFT.jsonl'  # input .jsonl file path
output_file = './data/openr1-SFT.json'  # output JSON file path
jsonl_to_json(input_file, output_file)
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen2.5-0.5B-Instruct"
model_name = "Qwen2.5-1.5B-Instruct"
model_name = "Qwen2.5-3B-Instruct"
model_name = "Qwen2.5-7B-Instruct-1M"

# model_name = "Llama-3.2-1B-Instruct"
# model_name = "Llama-3.2-3B-Instruct"
# model_name = "DeepSeek-R1-Distill-Qwen-1.5B"

# model_name = "DeepSeek-R1-Distill-Qwen-1.5B"
# model_name = "DeepSeek-R1-Distill-Qwen-7B"
# model_name = "DeepSeek-R1-Distill-Llama-8B"

# model_name = "./model-out/llama3.2-1b-r1"
# model_name = "./model-out/llama3.2-3b-r1"
# model_name = "./model-out/qwen2.5-0.5b-r1"
# model_name = "./model-out/qwen2.5-1.5b-r1"
model_name = "./model-out/deepqwen2.5-1.5b-r1"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# prompt = "介绍一下你自己"  # also very helpful for interpretability research: for a case this simple, the thinking content and the output content are consistent
# prompt = "写一篇关于DeepNexa R1的说明书"
# prompt = "保持健康的三个提示。"
# prompt = "天上有多少颗星星?"
prompt = "如何证明爱因斯坦质能方程,要求出现数学表达式"
messages = [
    {"role": "system", "content": "你是一个由StarRing开发有用的AI助手,名为DeepNexa R1。在回答问题时,要发挥你的思维链,尽量回答。"},
    {"role": "user", "content": "你要把这题的内部推理内容放入到<think>...</think>,而将推理的答案放入到<answer>...</answer>。问题是:"+prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1024
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
--------------------------------------------------------------------------------
/distillationgalpacalocal.py:
--------------------------------------------------------------------------------
import json
from openai import OpenAI
import time

def zeng_chat(usrprompt):
    client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
    chat_completion = client.chat.completions.create(
        model='Qwen/2.5',
        messages = [
            {
                "role": "user",
                "content": usrprompt
            }
        ],
    )
    #print(chat_completion)
    answer = chat_completion.choices[0].message.content
    return answer

def modify_json_file(input_file, output_file):
    # Read the original JSON file
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # Open the output file for writing
    with open(output_file, 'w', encoding='utf-8') as outfile:
        # Write the opening bracket of the JSON array
        outfile.write("[\n")

        # Walk through the data, process each record and write it out
        for i, item in enumerate(data):
            # Drop the "input" field if it exists
            if "input" in item:
                del item["input"]
            # Replace the "output" field with the distilled answer
            if "output" in item:
                item["output"] = zeng_chat(usrprompt=item['instruction'])

            # Write the current record through the same file handle
            json.dump(item, outfile, ensure_ascii=False, indent=4)

            print("题项:"+str(i+1))

            # Add a comma unless this is the last record
            if i < len(data) - 1:
                outfile.write(",\n")
            else:
                outfile.write("\n")  # no trailing comma after the last record
            # Flush so that partial results survive an interrupted run
            outfile.flush()

        # Write the closing bracket of the JSON array
        outfile.write("]\n")

    print(f"数据已成功从 {input_file} 修改并保存到 {output_file}")

# Example usage
input_file = './data/alpaca_gpt4_data_zh.json'  # original input JSON file path
output_file = './data/alpaca_r1_data_zh-local.json'  # modified output JSON file path
modify_json_file(input_file, output_file)
--------------------------------------------------------------------------------
/distillationgalpacaremote.py:
--------------------------------------------------------------------------------
import json
from openai import OpenAI
import time

def zeng_chat(usrprompt):
    API_SECRET_KEY = "XXX"
    BASE_URL = "XXX"

    client = OpenAI(api_key=API_SECRET_KEY, base_url=BASE_URL)
    chat_completion = client.chat.completions.create(
        model='XXX',
        messages = [
            {
                "role": "user",
                "content": usrprompt
            }
        ],
    )
    #print(chat_completion)
    answer = chat_completion.choices[0].message.content
    return answer

def modify_json_file(input_file, output_file):
    # Read the original JSON file
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # Open the output file for writing
    with open(output_file, 'w', encoding='utf-8') as outfile:
        # Write the opening bracket of the JSON array
        outfile.write("[\n")

        # Walk through the data, process each record and write it out
        for i, item in enumerate(data):
            # Drop the "input" field if it exists
            if "input" in item:
                del item["input"]
            # Replace the "output" field with the distilled answer
            if "output" in item:
                item["output"] = zeng_chat(usrprompt="你需要面对这题的内部推理内容放入到<think>...</think>,把答案放进<answer>...</answer>。问题:"+item['instruction'])

            # Write the current record through the same file handle
            json.dump(item, outfile, ensure_ascii=False, indent=4)

            print("题项:"+str(i+1))

            # Add a comma unless this is the last record
            if i < len(data) - 1:
                outfile.write(",\n")
            else:
                outfile.write("\n")  # no trailing comma after the last record
            # Flush so that partial results survive an interrupted run
            outfile.flush()

        # Write the closing bracket of the JSON array
        outfile.write("]\n")

    print(f"数据已成功从 {input_file} 修改并保存到 {output_file}")

# Example usage
input_file = './data/alpaca_gpt4_data_zh.json'  # original input JSON file path
output_file = './data/alpaca_r1_data_zh-remote.json'  # modified output JSON file path
modify_json_file(input_file, output_file)
--------------------------------------------------------------------------------
/generatefinetune.py:
--------------------------------------------------------------------------------
import torch
from peft import PeftModel
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer



device = "cuda"
#model_dir = 'Llama-3.2-1B-Instruct'
#model_dir = 'Llama-3.2-3B-Instruct'
#model_dir = 'Qwen2.5-0.5B-Instruct'
#model_dir = 'Qwen2.5-1.5B-Instruct'
model_dir = 'DeepSeek-R1-Distill-Qwen-1.5B'

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_dir)

model = PeftModel.from_pretrained(model, "./lora-out")
torch.set_default_tensor_type(torch.cuda.FloatTensor)


def generate_prompt(instruction, input=None):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


def evaluate(
    instruction,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    max_new_tokens=800
):
    # Today's models are strong enough that any of the common prompt formats is recognized
    prompt = generate_prompt(instruction, input=None)

    messages = [
        {"role": "system", "content": "你是一个有用的人工智能助手。"},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty
    )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response.strip()


gr.Interface(
    fn=evaluate,  # inference function
    inputs=[
        gr.components.Textbox(
            lines=2, label="问题", placeholder="给我讲解一道推理题~"
        ),
        gr.components.Slider(minimum=0.1, maximum=4.0, value=0.6, label="创造力"),
        gr.components.Slider(minimum=0.05, maximum=1.0, value=0.9, label="P参数"),
        gr.components.Slider(minimum=1, maximum=1000, step=1, value=50, label="K参数"),
        gr.components.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, label="惩罚参数"),
        gr.components.Slider(
            minimum=1, maximum=2048, step=1, value=1024, label="上下文长度"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=15,
            label="Output",
        )
    ],
    title="ChatUni",
    description="Chat,Your Own World",
).launch()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# R1-Nature

The starting point of this project is to reproduce R1-style results on small models. Through comparative experiments across different chain-of-thought (CoT) reasoning datasets and small models of several sizes (0.5B, 1B, 1.5B and 3B), it shows how finetuning outcomes differ and illustrates, in a fairly simple way, that the single most important factor in current O1/R1-style systems is the internal reasoning process inside the think section; that is the essence of R1. At the same time, the project holds that many detailed questions remain unresolved, such as the chain-of-thought reasoning explosion phenomenon and how to handle it; these deserve real attention from researchers rather than being reduced to simple distillation plus RL training.

## 1. Environment

PyTorch 2.3.0 + CUDA 12.1

Peft 0.14.0

transformers 4.48.2

llama.cpp b4646, CUDA 12.4

One RTX 4090 24 GB GPU

128 GB RAM, LM Studio 0.3.9

## 2. Resources

Dataset: https://huggingface.co/datasets/shareAI/Alpaca-Distill-R1-ZH

Models:

https://huggingface.co/StarRing2022/qwen2.5-0.5b-r1 (distilled and finetuned from Qwen2.5 0.5B)

https://huggingface.co/StarRing2022/llama3.2-1b-r1 (distilled and finetuned from Llama3.2 1B)

https://huggingface.co/StarRing2022/llama3.2-3b-r1 (distilled and finetuned from Llama3.2 3B)

https://huggingface.co/StarRing2022/qwen2.5-1.5b-r1 (distilled and finetuned from Qwen2.5 1.5B)

https://huggingface.co/StarRing2022/deepqwen2.5-1.5b-r1 (distilled and finetuned from DeepSeek's official Qwen2.5 1.5B distill)

https://huggingface.co/StarRing2022/R1-Alpaca-Lora (LoRA weights of the experimental models)

GGUF models (for on-device and edge hardware):

https://huggingface.co/shareAI/qwen2.5-0.5b-r1-GGUF (FP16)

https://huggingface.co/shareAI/llama3.2-1b-r1-GGUF (Q4 K_M)

https://huggingface.co/shareAI/llama3.2-3b-r1-GGUF (Q4 K_M)

## 3. Datasets

ringo1-CoT_demo.json: about 0.1K samples, mostly English mixed with Chinese, originally from Marco-o1, https://github.com/AIDC-AI/Marco-o1, used as a first finetuning trial; one epoch on a 1B model with a single 4090 takes only a few minutes. We changed the format tags to ... or the more programming-flavored ..., and removed the original ... tags; we found that tags of this "thinking" nature have a very large effect.

openr1-SFT.json: converted from jsonl, about 160K samples, mixed Chinese and English, originally from OpenO1, https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT , tag format `<think>...</think><answer>...</answer>`; one epoch on a 1B model with a single 4090 takes about 35 h.

magpie-r1.json: converted from parquet, about 10K samples, all English, with fairly long chain-of-thought content, originally from Magpie, https://huggingface.co/datasets/LangAGI-Lab/magpie-reasoning-v1-10k-step-by-step-rationale-alpaca-format ; one epoch on a 1B model with a single 4090 takes about 30 min.


The base corpus of the alpaca dataset used by this project comes from alpaca_gpt4_data_zh.json, originally 50K samples.

tiny_alpaca_XXX has only about 20 samples, used to take a first look at the reasoning differences between the distillation target models.

tiny_alpaca_zh-distill-o3.json: distilled from O3-MINI; it carries no `<think>`-style tags, only `> Reasoning`.

tiny_alpaca_zh-distill-gpt4o.json: distilled from GPT-4o, with the answers obtained via prompt engineering; tag format `<think>...</think><answer>...</answer>`.

tiny_alpaca_zh-distill-local.json: distilled from the locally deployed DeepSeek Qwen2.5 7B distill; the tag format only contains `<think>...</think>` and has no answer tag.


The corpora mainly used for finetuning: in our tests, for the same model, the dataset distilled from the local DeepSeek Qwen2.5 7B distill indeed lags behind data obtained from cloud models such as GPT-4o in loss; the loss gap can reach 0.4 (within a range of 0.8-1.2).

alpaca_r1_data_zh-remote.json: from the cloud model, pure Chinese, roughly the first 2.6K alpaca samples, the main test data for training. Note that we dropped the input field outright instead of concatenating it into the instruction, in order to obtain a mask-like effect and strengthen the model's ability to guess; one epoch on a 1B model with a single 4090 takes about 8 min.

alpaca_r1_data_zh-localpost.json: from the local model, pure Chinese, roughly the first 2K alpaca samples. The difference is that the content distilled by DeepSeek carries no answer tag. We found that forcing the annotation via prompt engineering cannot produce tagged data the way GPT-4o does and instead only makes the output messier, so the prompt, and especially the system prompt rule, should exist and matters a lot. Our processing simply uses string matching to add the `<answer>...</answer>` tags for training; one epoch on a 1B model with a single 4090 takes about 8 min.
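To make the target format concrete, a processed record in these JSON files has roughly the following shape. This is an illustrative, made-up sample rather than an actual entry from the datasets; the instruction is one of the demo prompts and the thinking/answer text is invented:

```json
{
    "instruction": "保持健康的三个提示。",
    "output": "<think>用户想要三条简短、可执行的健康建议,可以分别从饮食、运动和睡眠入手。</think>\n<answer>1. 均衡饮食;2. 规律运动;3. 保证充足睡眠。</answer>"
}
```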
## 4. Code structure

jsonl2json.py converts jsonl into the JSON format needed for R1 finetuning, with the tag format `<think>...</think><answer>...</answer>`.

parquet2json.py converts parquet into the JSON format needed for R1 finetuning, with the tag format `<think>...</think><answer>...</answer>`.

distillationgalpacalocal.py obtains distillation data from a local model hosted in LM Studio.

deallocaldistil.py: if the local model is a DeepSeek R1 model, the output data does not match the required training data format (as mentioned above, this is worth investigating); this script post-processes it into data carrying `<answer>` tags.

distillationgalpacaremote.py uses a prompt to obtain the required distillation data from a cloud LLM.

finetuneSFTR1.py runs SFT LoRA finetuning with the SFTTrainer from the trl library; by default LoRA finetunes 7 modules, the context length (CTX) is 1024, and training can be continued for 2-3 stages.

```json
"target_modules": [
    "q_proj",
    "o_proj",
    "up_proj",
    "k_proj",
    "down_proj",
    "v_proj",
    "gate_proj"
],
```

```python
output_dir: str = "./lora-out",
batch_size: int = 5,
micro_batch_size: int = 4,
num_epochs: int = 1,
learning_rate: float = 3e-4,
cutoff_len: int = 1024,
val_set_size: int = 1,
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
```

generatefinetune.py builds a simple Gradio UI to test the finetuned model.

demo.py tests the released models.

### Sample dialogue output:

```
一个很好的问题!爱因斯坦质方程是一种量子力学方程,用于描述粒子和量子系统的行为。让我们一步一步地分析一下。

爱因斯坦质方程是由Albert Einstein、Niels Bohr和Erwin Schrödinger在 1924 年首次提出。该方程描述了量子系统中粒子和量子系统的相互作用,包括相互作用的方向和强度。以下是一些爱因斯坦质方程:

1. 最初方程 (1):在量子系统中,粒子和量子系统的相互作用是通过电磁场来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。

∇⋅(E⋅ψ) = -ℏ(∂/∂tψ)

其中,E是电荷的力,ψ是粒子的 wave function,ℏ是量子波数,t是时间。

2. 最初方程 (2):在量子系统中,粒子和量子系统的相互作用是通过量子力来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。

Hψ = Eψ

其中,H是量子力矩阵,ψ是粒子的 wave function,E是粒子的能量。

3. 磁性量子力方程 (3):在量子系统中,粒子和量子系统的相互作用是通过磁场来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。

ℋ⋅ψ = -iℏ(∂/∂tψ)

其中,ℋ是磁场力矩阵,ψ是粒子的 wave function,i是虚数单位,ℏ是量子波数。

这些方程是爱因斯坦质方程的一部分,描述了量子系统中粒子和量子系统的相互作用。这些方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。

爱因斯坦质方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。这些方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。
```
![r1-alpaca效果](r1-alpaca效果.png)
![r1-studio效果](r1-studio效果.png)


## 5. The chain-of-thought reasoning explosion: cause and possible solutions

We have observed that once a model acquires strong reasoning ability, inference also becomes correspondingly slower. What is trickier is that the chain-of-thought content sometimes appears to enter an endless loop: with max_length set to 1024, the reasoning content can run past 8K characters. Such reasoning is clearly a failure, going back and forth, heading the wrong way, and arriving at no answer, let alone a correct one. We call this phenomenon the chain-of-thought reasoning explosion. Looking at models such as O3 and GPT-4o, in successful math-solving cases the reasoning content rarely exceeds 2048, so there must be some strategy that controls the breadth and length of chain-of-thought reasoning.

### Cause (how to reproduce):

When the model to finetune is Llama 3.2 1B, the dataset is magpie 10k, and SFT LoRA over all model layers is repeated for 3-5 epochs, this phenomenon is easily triggered: the finetuned model keeps reasoning on and on, drifting further and further. In other words, when the model is small, the dataset's thinking content is long but limited in volume, and the training intensity is too high, problems of this kind appear.

### Possible solutions:

(1) Group/step reward functions: use rewards to pin down or control the direction and content of the think section during finetuning (a minimal sketch follows this list).

(2) Freeze part of the LLM's layers during training. We consider this the simplest and most feasible option, though not necessarily the most effective. RL results seem to suggest that this strategy should operate on the key layers while giving up the less important "bystander" layers, for example finetuning only the q/k modules for just 1-2 epochs. But this brings another problem: if the data volume is small, the finetuned model's output format cannot be kept fully consistent and ideal.

(3) Use earlier agent-style prompt techniques to steer the overall thinking process.

(4) ZeroCoT, randomly interpolated CoT, https://github.com/Beortext/RWKV-ZeroCoT , is another related idea worth referencing, because the root cause still lies in the content covered by the think tags being expanded too far by the "bystander" layers.

(5) Differentiate the datasets used at different stages and improve their characteristics, scale and quality.
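For solution (1), such a reward can be written as a plain scoring function over the generated text and then used as one reward term in a GRPO/PPO-style trainer. The sketch below is only an illustration under this project's `<think>`/`<answer>` format; the function name, the 2048-character thinking budget and the score values are assumptions, not code from this repository:

```python
import re

THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
ANSWER_RE = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

def format_and_length_reward(completion: str, max_think_chars: int = 2048) -> float:
    """Reward well-formed <think>/<answer> structure and penalize runaway thinking."""
    think = THINK_RE.search(completion)
    answer = ANSWER_RE.search(completion)

    reward = 0.0
    if think:
        reward += 0.5  # the thinking section is present and properly closed
        overflow = len(think.group(1)) - max_think_chars
        if overflow > 0:  # soft penalty once the thinking budget is exceeded
            reward -= min(0.5, overflow / max_think_chars)
    if answer and answer.group(1).strip():
        reward += 0.5  # a non-empty, tagged answer follows the thinking
    return reward

if __name__ == "__main__":
    good = "<think>step 1 ... step 2 ...</think>\n<answer>42</answer>"
    looping = "<think>" + "maybe... " * 2000  # never closes the tag, keeps thinking forever
    print(format_and_length_reward(good))     # 1.0
    print(format_and_length_reward(looping))  # 0.0
```

In a real setup this term would be combined with an answer-correctness reward; on its own it only constrains format and length.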
## 6. Conclusions

#### 6.1 The thinking content (think) is the core; next comes the structuring of the whole strong CoT reasoning; after that come the finetuning strategy, training configuration, training intensity and so on

#### 6.2 Prompt rule design and externally guided CoT are still very necessary

#### 6.3 Tags are currently the most overlooked issue, yet more and more experiments show that the format and the think content have a more complex constraining relationship that still awaits discovery and verification

In short, it is not the case that distilling high-quality data and combining it with multi-stage RL training yields a near-perfect product; many detail-level factors deserve more attention from researchers in the field. Also, R1 finetuning relies mainly on multi-stage reinforcement learning; this project uses SFT precisely in order to present some fresh experimental results quickly, conveniently and understandably.

## 7. Future work

We have also found that even the 1B and 0.5B models show surprisingly pleasant abilities such as reasoning and backtracking analysis; even when the emergence of these abilities does not help the model solve the problem, they still have some value. Next we plan to elicit traces of reasoning on RWKV7 0.1B or even smaller models.
--------------------------------------------------------------------------------
/finetuneSFTR1.py:
--------------------------------------------------------------------------------
import os
import json
from typing import List

import fire
import torch
import transformers
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from modelscope import AutoModelForCausalLM, AutoTokenizer

device = "cuda:0"  # the device to load the model onto

def train(
    #base_model: str = "Llama-3.2-1B-Instruct",
    #base_model: str = "Llama-3.2-3B-Instruct",
    #base_model: str = "Qwen2.5-0.5B-Instruct",
    #base_model: str = "Qwen2.5-1.5B-Instruct",
    base_model: str = "DeepSeek-R1-Distill-Qwen-1.5B",

    #data_path: str = "./data/ringo1-CoT_demo.json",
    #data_path: str = "./data/openr1-SFT.json",
    #data_path: str = "./data/magpie-r1.json",
    data_path: str = "./data/alpaca_r1_data_zh-remote.json",
    #data_path: str = "./data/alpaca_r1_data_zh-localpost.json",

    output_dir: str = "./lora-out",
    batch_size: int = 5,
    micro_batch_size: int = 4,
    num_epochs: int = 1,
    learning_rate: float = 3e-4,
    cutoff_len: int = 1024,
    val_set_size: int = 1,
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,

    # lora_target_modules: List[str] = [
    #     "q_proj",
    #     "k_proj",
    # ],

    # lora_target_modules: List[str] = [
    #     "q_proj",
    #     "k_proj",
    #     "v_proj",
    #     "o_proj",
    # ],

    lora_target_modules: List[str] = ['q_proj', 'k_proj', 'v_proj', 'o_proj', "gate_proj", "up_proj", "down_proj"],
    train_on_inputs: bool = True,
    group_by_length: bool = False,
    wandb_project: str = "",
    wandb_run_name: str = "",
    wandb_watch: str = "",
    wandb_log_model: str = "",
    resume_from_checkpoint: str = None,
):
    print(
        f"Training Alpaca-LoRA model with params:\n"
        f"base_model: {base_model}\n"
        f"data_path: {data_path}\n"
        f"output_dir: {output_dir}\n"
        f"batch_size: {batch_size}\n"
        f"micro_batch_size: {micro_batch_size}\n"
        f"num_epochs: {num_epochs}\n"
        f"learning_rate: {learning_rate}\n"
        f"cutoff_len: {cutoff_len}\n"
        f"val_set_size: {val_set_size}\n"
        f"lora_r: {lora_r}\n"
        f"lora_alpha: {lora_alpha}\n"
        f"lora_dropout: {lora_dropout}\n"
        f"lora_target_modules: {lora_target_modules}\n"
        f"train_on_inputs: {train_on_inputs}\n"
        f"group_by_length: {group_by_length}\n"
        f"wandb_project: {wandb_project}\n"
        f"wandb_run_name: {wandb_run_name}\n"
        f"wandb_watch: {wandb_watch}\n"
        f"wandb_log_model: {wandb_log_model}\n"
        f"resume_from_checkpoint: {resume_from_checkpoint}\n"
    )

    gradient_accumulation_steps = batch_size // micro_batch_size

    use_wandb = len(wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    if len(wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = wandb_project
    if len(wandb_watch) > 0:
        os.environ["WANDB_WATCH"] = wandb_watch
    if len(wandb_log_model) > 0:
        os.environ["WANDB_LOG_MODEL"] = wandb_log_model

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map=device,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)
        result["labels"] = result["input_ids"].copy()
        return result

    def generate_and_tokenize_prompt(data_point):
        full_prompt = generate_prompt(data_point)
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            user_prompt = generate_prompt({**data_point, "output": ""})
            tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])
            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]
        return tokenized_full_prompt

    model = prepare_model_for_kbit_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    if data_path.endswith(".json"):
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    if resume_from_checkpoint:
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            model = set_peft_model_state_dict(model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    model.print_trainable_parameters()

    if val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    from trl import SFTTrainer, SFTConfig
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(  # SFTConfig
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=200 if val_set_size > 0 else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True if val_set_size > 0 else False,
            ddp_find_unused_parameters=False,
            group_by_length=group_by_length,
            report_to="wandb" if use_wandb else None,
            run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Save the LoRA adapter
    lora_weights = get_peft_model_state_dict(model)
    torch.save(lora_weights, os.path.join(output_dir, "adapter_model.bin"))

    config_dict = config.to_dict()
    config_dict["target_modules"] = list(config_dict["target_modules"])
    with open(os.path.join(output_dir, "adapter_config.json"), "w") as f:
        json.dump(config_dict, f, indent=4)

    print("\nTraining complete. LoRA adapter weights and config saved.")


def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

if __name__ == "__main__":
    fire.Fire(train)
--------------------------------------------------------------------------------