├── data
│   └── Readme
├── r1-alpaca效果.png
├── r1-studio效果.png
├── deallocaldistil.py
├── parquet2json.py
├── jsonl2json.py
├── demo.py
├── distillationgalpacalocal.py
├── distillationgalpacaremote.py
├── generatefinetune.py
├── README.md
└── finetuneSFTR1.py
/data/Readme:
--------------------------------------------------------------------------------
1 | Place the datasets here.
2 |
--------------------------------------------------------------------------------
/r1-alpaca效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StarRing2022/R1-Nature/HEAD/r1-alpaca效果.png
--------------------------------------------------------------------------------
/r1-studio效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StarRing2022/R1-Nature/HEAD/r1-studio效果.png
--------------------------------------------------------------------------------
/deallocaldistil.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | # Open the original JSON file
4 | input_file_path = "./data/alpaca_r1_data_zh-local.json" # replace with your original file path
5 | output_file_path = "./data/alpaca_r1_data_zh-localpost.json" # replace with the path where you want to save the new file
6 |
7 | # Read the JSON file
8 | with open(input_file_path, "r", encoding="utf-8") as file:
9 | data = json.load(file)
10 |
11 | # Iterate over the data and modify the "output" field
12 | for item in data:
13 | if "output" in item:
14 |         # Insert the opening answer tag right after the "</think>\n\n" boundary
15 |         modified_output = item["output"].replace("</think>\n\n", "</think>\n\n<answer>")  # NOTE: tag strings assumed (R1-style <think>/<answer>)
16 |         # Append the closing answer tag at the end of the content
17 |         modified_output += "</answer>"
18 | item["output"] = modified_output
19 |
20 | # Save the modified data to a new JSON file
21 | with open(output_file_path, "w", encoding="utf-8") as file:
22 | json.dump(data, file, ensure_ascii=False, indent=4)
23 |
24 | print("处理完成,新文件已保存到", output_file_path)
--------------------------------------------------------------------------------
/parquet2json.py:
--------------------------------------------------------------------------------
1 | import pyarrow.parquet as pq
2 | import json
3 | import pandas as pd
4 |
5 |
6 | def parquet_to_json(input_file, output_file):
7 |     # Read the Parquet file
8 | df = pd.read_parquet(input_file)
9 |     # Convert the DataFrame to JSON format and save it to a file
10 | if "input" in df.columns:
11 | df.drop(columns=["input"], inplace=True)
12 | if "output" in df.columns:
13 | df["output"] = df["output"].str.replace("", "")
14 | df["output"] = df["output"].str.replace("<\/thought>", "")
15 |
16 | df.to_json(output_file, orient='records', force_ascii=False, indent=4)
17 |
18 | print(f"数据已成功从 {input_file} 转换并保存到 {output_file}")
19 |
20 | # Example usage
21 | input_file = './data/magpie-reason-train-00000-of-00001.parquet' # path of the input Parquet file
22 | output_file = './data/magpie-r1.json' # path of the output JSON file
23 | parquet_to_json(input_file, output_file)
--------------------------------------------------------------------------------
/jsonl2json.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | def jsonl_to_json(input_file, output_file):
4 |     # Open the input .jsonl file
5 | with open(input_file, 'r', encoding='utf-8') as infile:
6 |         # Read the file line by line and parse each line as a JSON object
7 | data = [json.loads(line) for line in infile]
8 |
9 |
10 | for item in data:
11 | item['instruction'] = item.pop('prompt')
12 |
13 |         # NOTE: the tag strings below are assumed (OpenO1-SFT <Thought>/<Output> mapped to R1-style <think>/<answer>)
14 |         item['response'] = item['response'].replace("<Thought>", "<think>")
15 |         item['response'] = item['response'].replace("</Thought>", "</think>")
16 |         item['response'] = item['response'].replace("<Output>", "<answer>")
17 |         item['response'] = item['response'].replace("</Output>", "</answer>")
18 |         item['output'] = item.pop('response')
19 |
20 |
21 |     # Save the parsed data as a single JSON file
22 | with open(output_file, 'w', encoding='utf-8') as outfile:
23 | json.dump(data, outfile, ensure_ascii=False, indent=4)
24 |
25 | print(f"数据已成功从 {input_file} 转换并保存到 {output_file}")
26 |
27 | # Example usage
28 | input_file = './data/openo1-SFT.jsonl' # path of the input .jsonl file
29 | output_file = './data/openr1-SFT.json' # path of the output JSON file
30 | jsonl_to_json(input_file, output_file)
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | # model_name = "Qwen2.5-0.5B-Instruct"
4 | # model_name = "Qwen2.5-1.5B-Instruct"
5 | # model_name = "Qwen2.5-3B-Instruct"
6 | # model_name = "Qwen2.5-7B-Instruct-1M"
7 |
8 | # model_name = "Llama-3.2-1B-Instruct"
9 | # model_name = "Llama-3.2-3B-Instruct"
10 | # model_name = "DeepSeek-R1-Distill-Qwen-1.5B"
11 |
12 | # model_name = "DeepSeek-R1-Distill-Qwen-1.5B"
13 | # model_name = "DeepSeek-R1-Distill-Qwen-7B"
14 | # model_name = "DeepSeek-R1-Distill-Llama-8B"
15 |
16 | # model_name = "./model-out/llama3.2-1b-r1"
17 | # model_name = "./model-out/llama3.2-3b-r1"
18 | # model_name = "./model-out/qwen2.5-0.5b-r1"
19 | # model_name = "./model-out/qwen2.5-1.5b-r1"
20 | model_name = "./model-out/deepqwen2.5-1.5b-r1"
21 |
22 | model = AutoModelForCausalLM.from_pretrained(
23 | model_name,
24 | torch_dtype="auto",
25 | device_map="auto"
26 | )
27 | tokenizer = AutoTokenizer.from_pretrained(model_name)
28 |
29 | # prompt = "介绍一下你自己"  # also a big help for interpretability research: for something this simple, the think content and the output content coincide
30 | # prompt = "写一篇关于DeepNexa R1的说明书"
31 | # prompt = "保持健康的三个提示。"
32 | # prompt = "天上有多少颗星星?"
33 | prompt = "如何证明爱因斯坦质能方程,要求出现数学表达式"
34 | messages = [
35 | {"role": "system", "content": "你是一个由StarRing开发有用的AI助手,名为DeepNexa R1。在回答问题时,要发挥你的思维链,尽量回答。"},
36 | {"role": "user", "content": "你要把这题的内部推理内容放入到...,而将推理的答案放入到...。问题是:"+prompt}
37 | ]
38 | text = tokenizer.apply_chat_template(
39 | messages,
40 | tokenize=False,
41 | add_generation_prompt=True
42 | )
43 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
44 |
45 | generated_ids = model.generate(
46 | **model_inputs,
47 | max_new_tokens=1024
48 | )
49 | generated_ids = [
50 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
51 | ]
52 |
53 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
54 | print(response)
--------------------------------------------------------------------------------
/distillationgalpacalocal.py:
--------------------------------------------------------------------------------
1 | import json
2 | from openai import OpenAI
3 | import time
4 |
5 | def zeng_chat(usrprompt):
6 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
7 | chat_completion = client.chat.completions.create(
8 | model='Qwen/2.5',
9 | messages = [
10 | {
11 | "role": "user",
12 | "content": usrprompt
13 | }
14 | ],
15 | )
16 | #print(chat_completion)
17 | answer = chat_completion.choices[0].message.content
18 | return answer
19 |
20 | def modify_json_file(input_file, output_file):
21 |     # Read the original JSON file
22 | with open(input_file, 'r', encoding='utf-8') as infile:
23 | data = json.load(infile)
24 |
25 |     # Open the output file for writing
26 | with open(output_file, 'w', encoding='utf-8') as outfile:
27 |         # Write the opening bracket of the JSON array
28 | outfile.write("[\n")
29 |
30 |         # Iterate over the data, processing and writing one item at a time
31 | for i, item in enumerate(data):
32 |             # Remove the "input" field (if present)
33 | if "input" in item:
34 | del item["input"]
35 |             # Replace the content of the "output" field with the distilled answer
36 | if "output" in item:
37 | item["output"] = zeng_chat(usrprompt=item['instruction'])
38 |
39 |             # Write the current item through the same output handle (re-opening the file
40 |             # in append mode while the outer 'w' handle is still open corrupts the output)
41 |             json.dump(item, outfile, ensure_ascii=False, indent=4)
42 |             outfile.flush()  # persist progress after every API call
43 |             print("Item: " + str(i + 1))
44 |
45 |             # Add a comma unless this is the last item
46 |             if i < len(data) - 1:
47 |                 outfile.write(",\n")
48 |             else:
49 |                 outfile.write("\n")  # no comma after the last item
50 |
51 |         # Write the closing bracket of the JSON array
52 | outfile.write("]\n")
53 |
54 | print(f"数据已成功从 {input_file} 修改并保存到 {output_file}")
55 |
56 | # Example usage
57 | input_file = './data/alpaca_gpt4_data_zh.json' # path of the original input JSON file
58 | output_file = './data/alpaca_r1_data_zh-local.json' # path of the modified output JSON file
59 | modify_json_file(input_file, output_file)
--------------------------------------------------------------------------------
/distillationgalpacaremote.py:
--------------------------------------------------------------------------------
1 | import json
2 | from openai import OpenAI
3 | import time
4 |
5 | def zeng_chat(usrprompt):
6 | API_SECRET_KEY = "XXX"
7 | BASE_URL = "XXX"
8 |
9 | client = OpenAI(api_key=API_SECRET_KEY, base_url=BASE_URL)
10 | chat_completion = client.chat.completions.create(
11 | model='XXX',
12 | messages = [
13 | {
14 | "role": "user",
15 | "content": usrprompt
16 | }
17 | ],
18 | )
19 | #print(chat_completion)
20 | answer = chat_completion.choices[0].message.content
21 | return answer
22 |
23 | def modify_json_file(input_file, output_file):
24 |     # Read the original JSON file
25 | with open(input_file, 'r', encoding='utf-8') as infile:
26 | data = json.load(infile)
27 |
28 |     # Open the output file for writing
29 | with open(output_file, 'w', encoding='utf-8') as outfile:
30 |         # Write the opening bracket of the JSON array
31 | outfile.write("[\n")
32 |
33 |         # Iterate over the data, processing and writing one item at a time
34 | for i, item in enumerate(data):
35 |             # Remove the "input" field (if present)
36 | if "input" in item:
37 | del item["input"]
38 |             # Replace the content of the "output" field with the distilled answer
39 | if "output" in item:
40 | item["output"] = zeng_chat(usrprompt="你需要面对这题的内部推理内容放入到...,把答案放进...。问题:"+item['instruction'])
41 |
42 |             # Write the current item through the same output handle (re-opening the file
43 |             # in append mode while the outer 'w' handle is still open corrupts the output)
44 |             json.dump(item, outfile, ensure_ascii=False, indent=4)
45 |             outfile.flush()  # persist progress after every API call
46 |             print("Item: " + str(i + 1))
47 |
48 |             # Add a comma unless this is the last item
49 |             if i < len(data) - 1:
50 |                 outfile.write(",\n")
51 |             else:
52 |                 outfile.write("\n")  # no comma after the last item
53 |
54 |         # Write the closing bracket of the JSON array
55 | outfile.write("]\n")
56 |
57 | print(f"数据已成功从 {input_file} 修改并保存到 {output_file}")
58 |
59 | # Example usage
60 | input_file = './data/alpaca_gpt4_data_zh.json' # path of the original input JSON file
61 | output_file = './data/alpaca_r1_data_zh-remote.json' # path of the modified output JSON file
62 | modify_json_file(input_file, output_file)
--------------------------------------------------------------------------------
/generatefinetune.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from peft import PeftModel
3 | import gradio as gr
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 |
6 |
7 |
8 | device = "cuda"
9 | #model_dir = 'Llama-3.2-1B-Instruct'
10 | #model_dir = 'Llama-3.2-3B-Instruct'
11 | #model_dir = 'Qwen2.5-0.5B-Instruct'
12 | #model_dir = 'Qwen2.5-1.5B-Instruct'
13 | model_dir = 'DeepSeek-R1-Distill-Qwen-1.5B'
14 |
15 | model = AutoModelForCausalLM.from_pretrained(
16 | model_dir,
17 | torch_dtype=torch.bfloat16,
18 | device_map="auto"
19 | )
20 |
21 | tokenizer = AutoTokenizer.from_pretrained(model_dir)
22 |
23 | model = PeftModel.from_pretrained(model, "./lora-out")  # load the LoRA adapter produced by finetuneSFTR1.py
24 | torch.set_default_tensor_type(torch.cuda.FloatTensor)
25 |
26 |
27 | def generate_prompt(instruction, input=None):
28 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
29 |
30 | ### Instruction:
31 | {instruction}
32 |
33 | ### Response:"""
34 |
35 |
36 | def evaluate(
37 | instruction,
38 | temperature=0.6,
39 | top_p=0.9,
40 | top_k=50,
41 | repetition_penalty=1.2,
42 | max_new_tokens=800
43 | ):
44 |     # Today's models are strong enough that whichever common prompt format is used, it is recognized
45 | prompt = generate_prompt(instruction, input=None)
46 |
47 | messages = [
48 | {"role": "system", "content": "你是一个有用的人工智能助手。"},
49 | {"role": "user", "content": prompt}
50 | ]
51 |
52 | text = tokenizer.apply_chat_template(
53 | messages,
54 | tokenize=False,
55 | add_generation_prompt=True
56 | )
57 |
58 | model_inputs = tokenizer([text], return_tensors="pt").to(device)
59 |
60 |     generated_ids = model.generate(
61 |         model_inputs.input_ids,
62 |         max_new_tokens=max_new_tokens,
63 |         do_sample=True, temperature=temperature,  # enable sampling so the temperature/top_p/top_k sliders actually take effect
64 |         top_p=top_p,
65 |         top_k=top_k,
66 |         repetition_penalty=repetition_penalty
67 |     )
68 |
69 |
70 | generated_ids = [
71 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
72 | ]
73 |
74 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
75 |
76 | return response.strip()
77 |
78 |
79 | gr.Interface(
80 |     fn=evaluate,  # interface function
81 | inputs=[
82 | gr.components.Textbox(
83 | lines=2, label="问题", placeholder="给我讲解一道推理题~"
84 | ),
85 | gr.components.Slider(minimum=0.1, maximum=4.0, value=0.6, label="创造力"),
86 | gr.components.Slider(minimum=0.05, maximum=1.0, value=0.9, label="P参数"),
87 | gr.components.Slider(minimum=1, maximum=1000, step=1, value=50, label="K参数"),
88 | gr.components.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, label="惩罚参数"),
89 | gr.components.Slider(
90 | minimum=1, maximum=2048, step=1, value=1024, label="上下文长度"
91 | ),
92 | ],
93 | outputs=[
94 | gr.components.Textbox(
95 | lines=15,
96 | label="Output",
97 | )
98 | ],
99 | title="ChatUni",
100 | description="Chat,Your Own World",
101 | ).launch()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # R1-Nature
2 |
3 | The starting point of this project is to reproduce R1-style results on small models, and to show through comparative experiments how fine-tuning outcomes differ across different reasoning CoT datasets and different small model sizes (0.5B, 1B, 1.5B and 3B). It illustrates, in a deliberately simple way, that in current O1/R1-like systems the most important factor is the internal reasoning process inside think; this is the essence of R1. At the same time, the project argues that a large number of detail-level questions remain unclarified, such as the chain-of-thought reasoning explosion phenomenon and its remedies, and that these deserve researchers' full attention rather than being reduced to simple distillation + RL.
4 |
5 | ## 1. Project environment
6 |
7 | PyTorch 2.3.0 + CUDA 12.1
8 |
9 | PEFT 0.14.0
10 |
11 | transformers 4.48.2
12 |
13 | llama.cpp b4646, CUDA 12.4
14 |
15 | One RTX 4090 24 GB GPU
16 |
17 | 128 GB RAM, LM Studio 0.3.9
18 |
19 | ## 2. Shared resources
20 |
21 | Dataset: https://huggingface.co/datasets/shareAI/Alpaca-Distill-R1-ZH
22 |
23 | Models:
24 |
25 | https://huggingface.co/StarRing2022/qwen2.5-0.5b-r1 (distillation fine-tune of Qwen2.5 0.5B)
26 |
27 | https://huggingface.co/StarRing2022/llama3.2-1b-r1 (distillation fine-tune of Llama 3.2 1B)
28 |
29 | https://huggingface.co/StarRing2022/llama3.2-3b-r1 (distillation fine-tune of Llama 3.2 3B)
30 |
31 | https://huggingface.co/StarRing2022/qwen2.5-1.5b-r1 (distillation fine-tune of Qwen2.5 1.5B)
32 |
33 | https://huggingface.co/StarRing2022/deepqwen2.5-1.5b-r1 (distillation fine-tune of the Qwen2.5 1.5B distill officially released by DeepSeek)
34 |
35 | https://huggingface.co/StarRing2022/R1-Alpaca-Lora (LoRA weights of each experimental model)
36 |
37 | GGUF models (for on-device and edge-compute hardware):
38 |
39 | https://huggingface.co/shareAI/qwen2.5-0.5b-r1-GGUF (FP16)
40 |
41 | https://huggingface.co/shareAI/llama3.2-1b-r1-GGUF (Q4_K_M)
42 |
43 | https://huggingface.co/shareAI/llama3.2-3b-r1-GGUF (Q4_K_M)
44 |
45 | ## 3. Datasets
46 |
47 | ringo1-CoT_demo.json: about 0.1K samples, mostly English with some Chinese; the source data comes from Marco-o1, https://github.com/AIDC-AI/Marco-o1 . It is used for quick fine-tuning trials: a 1B model on a single 4090 needs only a few minutes for one epoch. The format tags were changed to `<think>...</think>`-style tags (or a more programming-flavored variant) and the original tags were removed; we found that this kind of thinking-oriented tag has a very large impact.
48 |
49 | openr1-SFT.json: converted from jsonl, about 160K samples, mixed Chinese and English; the source dataset is OpenO1, https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT ; the tag format is `<think>...</think>` and `<answer>...</answer>`; one epoch with a 1B model on a single 4090 takes about 35 h.
50 |
51 | magpie-r1.json: converted from parquet, about 10K samples, all English, with fairly long chain-of-thought content; the source dataset is Magpie, https://huggingface.co/datasets/LangAGI-Lab/magpie-reasoning-v1-10k-step-by-step-rationale-alpaca-format ; one epoch with a 1B model on a single 4090 takes about 30 min.
52 |
53 |
54 |
55 | The base corpus for the alpaca datasets used in this project comes from alpaca_gpt4_data_zh.json, originally 50K samples.
56 |
57 | The tiny_alpaca_XXX files contain only about 20 samples each, used for a first observation and evaluation of the reasoning differences between the distillation target models.
58 |
59 | tiny_alpaca_zh-distill-o3.json: distilled from O3-MINI; there are no `<think>`-style tags, the reasoning still appears as plain `> Reasoning` blocks.
60 |
61 | tiny_alpaca_zh-distill-gpt4o.json: distilled from GPT-4o, with the answers obtained via prompt engineering; the tag format is `<think>...</think>` and `<answer>...</answer>`.
62 |
63 | tiny_alpaca_zh-distill-local.json: distilled from the locally served Qwen2.5 7B distill released by DeepSeek; the tag format contains only `<think>...</think>`, with no answer tags.
64 |
65 |
66 |
67 | For the corpus mainly used for fine-tuning, we measured that, on the same model, the dataset distilled from the locally served DeepSeek-released Qwen2.5 7B is indeed worse in terms of loss than the data obtained from cloud models such as GPT-4o; the loss gap can reach 0.4 (within a 0.8-1.2 range).
68 |
69 | alpaca_r1_data_zh-remote.json: from the cloud model, pure Chinese, roughly the first 2.6K alpaca samples; the main training and test data. Note that we drop the input field entirely instead of concatenating it into the instruction, in order to obtain a mask-like effect and strengthen the model's ability to guess; one epoch with a 1B model on a single 4090 takes about 8 min.
70 |
71 | alpaca_r1_data_zh-localpost.json: from the local model, pure Chinese, roughly the first 2K alpaca samples. The difference is that the content distilled from DeepSeek carries no answer tags; we found that forcing the tags via prompt engineering cannot produce tagged data the way GPT-4o does and instead makes the `<think>` part confused as well, so the prompt, and especially a System Prompt Rule, clearly should exist and matters a lot. Our workaround is string matching to add the `<answer>...</answer>` tags for training (this is what deallocaldistil.py does); one epoch with a 1B model on a single 4090 takes about 8 min.
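
For orientation, a single converted training record has roughly the following shape; the field names follow the alpaca format used above, and the tag strings are the R1-style tags assumed throughout this README:

```python
# Illustrative record shape only; the instruction is an example prompt from demo.py,
# and the <think>/<answer> tag strings are the assumed R1-style format.
record = {
    "instruction": "保持健康的三个提示。",
    "output": "<think>…internal reasoning…</think>\n\n<answer>…final answer…</answer>",
}
```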
72 |
73 | ## 4. Code structure
74 |
75 | jsonl2json.py: converts jsonl into the JSON format required for R1 fine-tuning; the tag format is `<think>...</think>` and `<answer>...</answer>`.
76 |
77 | parquet2json.py: converts parquet into the JSON format required for R1 fine-tuning; the tag format is `<think>...</think>` and `<answer>...</answer>`.
78 |
79 | distillationgalpacalocal.py: obtains distillation data from a local model served in LM Studio.
80 |
81 | deallocaldistil.py: if the local model is a DeepSeek R1 model, its output format does not match the training data format we need (as mentioned above, this is worth studying); this script post-processes the data into tagged data with `<answer>` tags.
82 |
83 | distillationgalpacaremote.py: uses a prompt to obtain the required distillation data from a cloud LLM.
84 |
85 | finetuneSFTR1.py: SFT LoRA fine-tuning using the SFTTrainer from the trl library; by default LoRA targets 7 modules, the context length (CTX) is 1024, and training can be continued over 2-3 stages.
86 |
87 | ```json
88 | "target_modules": [
89 | "q_proj",
90 | "o_proj",
91 | "up_proj",
92 | "k_proj",
93 | "down_proj",
94 | "v_proj",
95 | "gate_proj"
96 | ],
97 | ```
98 |
99 | ```python
100 | output_dir: str = "./lora-out",
101 | batch_size: int = 5,
102 | micro_batch_size: int = 4,
103 | num_epochs: int = 1,
104 | learning_rate: float = 3e-4,
105 | cutoff_len: int = 1024,
106 | val_set_size: int = 1,
107 | lora_r: int = 8,
108 | lora_alpha: int = 16,
109 | lora_dropout: float = 0.05,
110 | ```
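
Since finetuneSFTR1.py exposes train() through fire, a run can be started from the command line (for example `python finetuneSFTR1.py --data_path ./data/alpaca_r1_data_zh-remote.json`) or from Python. A minimal launch sketch, with arguments simply mirroring the defaults listed above:

```python
# Minimal launch sketch; model and data paths mirror the defaults in finetuneSFTR1.py
# and should be adapted to the local checkout.
from finetuneSFTR1 import train

train(
    base_model="DeepSeek-R1-Distill-Qwen-1.5B",
    data_path="./data/alpaca_r1_data_zh-remote.json",
    output_dir="./lora-out",
    num_epochs=1,
    cutoff_len=1024,
)
```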
111 |
112 | generatefinetune.py: sets up a simple Gradio app for testing the fine-tuned model.
113 |
114 | demo.py: tests the shared models.
115 |
116 | ### Sample conversation output:
117 |
118 | ```
119 | 一个很好的问题!爱因斯坦质方程是一种量子力学方程,用于描述粒子和量子系统的行为。让我们一步一步地分析一下。
120 |
121 | 爱因斯坦质方程是由Albert Einstein、Niels Bohr和Erwin Schrödinger在 1924 年首次提出。该方程描述了量子系统中粒子和量子系统的相互作用,包括相互作用的方向和强度。以下是一些爱因斯坦质方程:
122 |
123 | 1. 最初方程 (1):在量子系统中,粒子和量子系统的相互作用是通过电磁场来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。
124 |
125 | ∇⋅(E⋅ψ) = -ℏ(∂/∂tψ)
126 |
127 | 其中,E是电荷的力,ψ是粒子的 wave function,ℏ是量子波数,t是时间。
128 |
129 | 2. 最初方程 (2):在量子系统中,粒子和量子系统的相互作用是通过量子力来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。
130 |
131 | Hψ = Eψ
132 |
133 | 其中,H是量子力矩阵,ψ是粒子的 wave function,E是粒子的能量。
134 |
135 | 3. 磁性量子力方程 (3):在量子系统中,粒子和量子系统的相互作用是通过磁场来实现的。该方程描述了粒子和量子系统的相互作用,包括相互作用的方向和强度。
136 |
137 | ℋ⋅ψ = -iℏ(∂/∂tψ)
138 |
139 | 其中,ℋ是磁场力矩阵,ψ是粒子的 wave function,i是虚数单位,ℏ是量子波数。
140 |
141 | 这些方程是爱因斯坦质方程的一部分,描述了量子系统中粒子和量子系统的相互作用。这些方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。
142 |
143 | 爱因斯坦质方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。这些方程是量子力学中的基本方程,形象地描述了量子系统的相互作用。
144 | ```
145 | 
146 | 
147 |
148 |
149 | ## 5. The chain-of-thought reasoning explosion: causes and possible solutions
150 |
151 | We have observed that once a model acquires strong reasoning, inference also becomes correspondingly slower; more troubling, the chain-of-thought content sometimes appears to enter a dead loop: with max_length set to 1024, the reasoning content can still reach more than 8K characters. Such reasoning clearly fails, agonizing back and forth, heading in the wrong direction, reaching no answer, let alone the correct one. We call this phenomenon chain-of-thought reasoning explosion. Looking at models such as O3 and GPT-4o, in successful math-solving cases the reasoning content rarely exceeds 2048, so there must be control strategies for the breadth and length of chain-of-thought reasoning.
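
A simple generation-time guard (a practical cap, not a fix for the training-side cause analysed below) is to give the model a hard token budget and cut generation as soon as the answer tag closes. A minimal sketch with transformers, where model, tokenizer and model_inputs are built exactly as in demo.py and the "</answer>" stop string follows the tag format assumed in this README:

```python
# Generation-time guard against CoT explosion: hard budget plus an early stop.
# "</answer>" is the assumed R1-style closing tag used elsewhere in this project.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1024,           # hard budget for think + answer
    stop_strings=["</answer>"],    # stop once the answer tag is closed
    tokenizer=tokenizer,           # transformers needs the tokenizer to match stop strings
)
```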
152 |
153 | ### Cause (how to reproduce):
154 |
155 | When the model to be fine-tuned is Llama 3.2 1B, the dataset is magpie_10k, and SFT LoRA over all model layers is repeated for 3-5 epochs, this phenomenon is easily triggered: the fine-tuned model keeps reasoning on and on, drifting further and further away. Evidently, when the model is small, the dataset's reasoning content is long but its volume is modest, and the training is pushed too hard, this kind of problem occurs.
156 |
157 | ### Possible solutions:
158 |
159 | (1) Group/step reward functions (e.g. GRPO-style step rewards): use rewards to fix or control the direction and content of the fine-tuned think segment.
160 |
161 | (2) Freeze part of the LLM's layers during training. We consider this the simplest and most feasible approach, though not necessarily the most effective. RL-style training seems to point the same way: this strategy suits operating on the key layers while giving up on the "bystander" layers. For example, fine-tune only the q/k modules for just 1-2 epochs (a sketch follows this list); this, however, brings another problem: if the amount of data is small, the output format of the fine-tuned model cannot be kept fully consistent and ideal.
162 |
163 | (3) Use the earlier agent-style prompt techniques to steer the overall thinking process.
164 |
165 | (4) ZeroCoT random CoT interpolation, https://github.com/Beortext/RWKV-ZeroCoT , is also a related idea worth consulting, because the root cause of the problem still lies in the content covered by the think tag being over-expanded by, for example, the "bystander" layers.
166 |
167 | (5) Differentiate the datasets used at different stages and improve their characteristics, scale and quality.
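
For solution (2), the corresponding LoRA configuration is already present as a commented-out option in finetuneSFTR1.py; a minimal sketch of restricting the adapters to the q/k projections with PEFT:

```python
# Sketch for solution (2): adapt only the q/k projections instead of all 7 modules,
# leaving the remaining layers frozen; values match the defaults in finetuneSFTR1.py.
from peft import LoraConfig

config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj"],  # key attention projections only
)
```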
168 |
169 | ## 6. Conclusions
170 |
171 | #### 6.1 The think content is the core; next comes the structuring of the whole strong CoT reasoning; after that come the fine-tuning strategy, training configuration, training intensity and so on
172 |
173 | #### 6.2 Rule design for prompts and externally attached guiding CoT are still very necessary
174 |
175 | #### 6.3 Tags are currently the most neglected item, but more and more experiments show that the format and the think content constrain each other in more complex ways that still await our discovery and verification
176 |
177 | In short, it is not the case that, as it might appear, distilling high-quality data combined with multi-stage RL immediately yields a near-perfect product; many detail-level factors deserve more attention from researchers. Also, R1 fine-tuning relies mainly on multi-stage reinforcement learning; this project uses SFT instead in order to present some fresh experimental results quickly, conveniently and understandably.
178 |
179 | ## 7. Future work
180 |
181 | We have also found that even the 1B and 0.5B models show genuinely surprising reasoning and backtracking-analysis behaviors; even when these emerging abilities do not help the model solve the problem, they still carry some value. Next we plan to make traces of reasoning emerge on RWKV7 0.1B or even smaller models.
182 |
--------------------------------------------------------------------------------
/finetuneSFTR1.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from typing import List
4 |
5 | import fire
6 | import torch
7 | import transformers
8 | from datasets import load_dataset
9 |
10 | from peft import (
11 | LoraConfig,
12 | get_peft_model,
13 | get_peft_model_state_dict,
14 | prepare_model_for_kbit_training,
15 | set_peft_model_state_dict,
16 | )
17 | from modelscope import AutoModelForCausalLM, AutoTokenizer
18 |
19 | device = "cuda:0" # the device to load the model onto
20 |
21 | def train(
22 | #base_model: str = "Llama-3.2-1B-Instruct",
23 | #base_model: str = "Llama-3.2-3B-Instruct",
24 | #base_model: str = "Qwen2.5-0.5B-Instruct",
25 | #base_model: str = "Qwen2.5-1.5B-Instruct",
26 | base_model: str = "DeepSeek-R1-Distill-Qwen-1.5B",
27 |
28 | #data_path: str = "./data/ringo1-CoT_demo.json",
29 | #data_path: str = "./data/openr1-SFT.json",
30 | #data_path: str = "./data/magpie-r1.json",
31 | data_path: str = "./data/alpaca_r1_data_zh-remote.json",
32 | #data_path: str = "./data/alpaca_r1_data_zh-localpost.json",
33 |
34 | output_dir: str = "./lora-out",
35 | batch_size: int = 5,
36 | micro_batch_size: int = 4,
37 | num_epochs: int = 1,
38 | learning_rate: float = 3e-4,
39 | cutoff_len: int = 1024,
40 | val_set_size: int = 1,
41 | lora_r: int = 8,
42 | lora_alpha: int = 16,
43 | lora_dropout: float = 0.05,
44 |
45 | # lora_target_modules: List[str] = [
46 | # "q_proj",
47 | # "k_proj",
48 | # ],
49 |
50 | # lora_target_modules: List[str] = [
51 | # "q_proj",
52 | # "k_proj",
53 | # "v_proj",
54 | # "o_proj",
55 | # ],
56 |
57 | lora_target_modules: List[str] = ['q_proj', 'k_proj', 'v_proj', 'o_proj', "gate_proj", "up_proj", "down_proj"],
58 |
59 | train_on_inputs: bool = True,
60 | group_by_length: bool = False,
61 | wandb_project: str = "",
62 | wandb_run_name: str = "",
63 | wandb_watch: str = "",
64 | wandb_log_model: str = "",
65 | resume_from_checkpoint: str = None,
66 | ):
67 | print(
68 | f"Training Alpaca-LoRA model with params:\n"
69 | f"base_model: {base_model}\n"
70 | f"data_path: {data_path}\n"
71 | f"output_dir: {output_dir}\n"
72 | f"batch_size: {batch_size}\n"
73 | f"micro_batch_size: {micro_batch_size}\n"
74 | f"num_epochs: {num_epochs}\n"
75 | f"learning_rate: {learning_rate}\n"
76 | f"cutoff_len: {cutoff_len}\n"
77 | f"val_set_size: {val_set_size}\n"
78 | f"lora_r: {lora_r}\n"
79 | f"lora_alpha: {lora_alpha}\n"
80 | f"lora_dropout: {lora_dropout}\n"
81 | f"lora_target_modules: {lora_target_modules}\n"
82 | f"train_on_inputs: {train_on_inputs}\n"
83 | f"group_by_length: {group_by_length}\n"
84 | f"wandb_project: {wandb_project}\n"
85 | f"wandb_run_name: {wandb_run_name}\n"
86 | f"wandb_watch: {wandb_watch}\n"
87 | f"wandb_log_model: {wandb_log_model}\n"
88 | f"resume_from_checkpoint: {resume_from_checkpoint}\n"
89 | )
90 |
91 | gradient_accumulation_steps = batch_size // micro_batch_size
92 |
93 | use_wandb = len(wandb_project) > 0 or (
94 | "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
95 | )
96 | if len(wandb_project) > 0:
97 | os.environ["WANDB_PROJECT"] = wandb_project
98 | if len(wandb_watch) > 0:
99 | os.environ["WANDB_WATCH"] = wandb_watch
100 | if len(wandb_log_model) > 0:
101 | os.environ["WANDB_LOG_MODEL"] = wandb_log_model
102 |
103 | model = AutoModelForCausalLM.from_pretrained(
104 | base_model,
105 | load_in_8bit=False,
106 | torch_dtype=torch.float16,
107 | device_map=device,
108 | )
109 |
110 | tokenizer = AutoTokenizer.from_pretrained(base_model)
111 |
112 | tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
113 | tokenizer.padding_side = "left" # Allow batched inference
114 |
115 | def tokenize(prompt, add_eos_token=True):
116 | result = tokenizer(
117 | prompt,
118 | truncation=True,
119 | max_length=cutoff_len,
120 | padding=False,
121 | return_tensors=None,
122 | )
123 | if (
124 | result["input_ids"][-1] != tokenizer.eos_token_id
125 | and len(result["input_ids"]) < cutoff_len
126 | and add_eos_token
127 | ):
128 | result["input_ids"].append(tokenizer.eos_token_id)
129 | result["attention_mask"].append(1)
130 | result["labels"] = result["input_ids"].copy()
131 | return result
132 |
133 | def generate_and_tokenize_prompt(data_point):
134 | full_prompt = generate_prompt(data_point)
135 | tokenized_full_prompt = tokenize(full_prompt)
136 | if not train_on_inputs:
137 | user_prompt = generate_prompt({**data_point, "output": ""})
138 | tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
139 | user_prompt_len = len(tokenized_user_prompt["input_ids"])
140 | tokenized_full_prompt["labels"] = [
141 | -100
142 | ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]
143 | return tokenized_full_prompt
144 |
145 | model = prepare_model_for_kbit_training(model)
146 |
147 | config = LoraConfig(
148 | r=lora_r,
149 | lora_alpha=lora_alpha,
150 | target_modules=lora_target_modules,
151 | lora_dropout=lora_dropout,
152 | bias="none",
153 | task_type="CAUSAL_LM",
154 | )
155 | model = get_peft_model(model, config)
156 |
157 | if data_path.endswith(".json"):
158 | data = load_dataset("json", data_files=data_path)
159 | else:
160 | data = load_dataset(data_path)
161 |
162 | if resume_from_checkpoint:
163 | checkpoint_name = os.path.join(
164 | resume_from_checkpoint, "adapter_model.bin"
165 | )
166 | if os.path.exists(checkpoint_name):
167 | print(f"Restarting from {checkpoint_name}")
168 | adapters_weights = torch.load(checkpoint_name)
169 |             set_peft_model_state_dict(model, adapters_weights)  # loads the adapter weights in place (the return value is not a model)
170 | else:
171 | print(f"Checkpoint {checkpoint_name} not found")
172 |
173 | model.print_trainable_parameters()
174 |
175 | if val_set_size > 0:
176 | train_val = data["train"].train_test_split(
177 | test_size=val_set_size, shuffle=True, seed=42
178 | )
179 | train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
180 | val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
181 | else:
182 | train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
183 | val_data = None
184 |
185 | from trl import SFTTrainer,SFTConfig
186 | trainer = SFTTrainer(
187 | model=model,
188 | train_dataset=train_data,
189 | eval_dataset=val_data,
190 | args=transformers.TrainingArguments( #SFTConfig
191 | per_device_train_batch_size=micro_batch_size,
192 | gradient_accumulation_steps=gradient_accumulation_steps,
193 | warmup_steps=100,
194 | num_train_epochs=num_epochs,
195 | learning_rate=learning_rate,
196 | fp16=True,
197 | logging_steps=10,
198 | optim="adamw_torch",
199 |             eval_strategy="steps" if val_set_size > 0 else "no",
200 | save_strategy="steps",
201 | eval_steps=200 if val_set_size > 0 else None,
202 | save_steps=200,
203 | output_dir=output_dir,
204 | save_total_limit=3,
205 | load_best_model_at_end=True if val_set_size > 0 else False,
206 | ddp_find_unused_parameters=False,
207 | group_by_length=group_by_length,
208 | report_to="wandb" if use_wandb else None,
209 | run_name=wandb_run_name if use_wandb else None,
210 | ),
211 | data_collator=transformers.DataCollatorForSeq2Seq(
212 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
213 | ),
214 | )
215 | model.config.use_cache = False
216 |
217 | trainer.train(resume_from_checkpoint=resume_from_checkpoint)
218 |
219 | # Save the LoRA adapter
220 | lora_weights = get_peft_model_state_dict(model)
221 | torch.save(lora_weights, os.path.join(output_dir, "adapter_model.bin"))
222 |
223 | config_dict = config.to_dict()
224 | config_dict["target_modules"] = list(config_dict["target_modules"])
225 | with open(os.path.join(output_dir, "adapter_config.json"), "w") as f:
226 | json.dump(config_dict, f, indent=4)
227 |
228 | print("\nTraining complete. LoRA adapter weights and config saved.")
229 |
230 |
231 | def generate_prompt(data_point):
232 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
233 |
234 | ### Instruction:
235 | {data_point["instruction"]}
236 |
237 | ### Response:
238 | {data_point["output"]}"""
239 |
240 | if __name__ == "__main__":
241 | fire.Fire(train)
--------------------------------------------------------------------------------