├── README.md
├── README_EN.md
├── assets
│   ├── Llama4-Maverick.png
│   ├── base_eval.png
│   ├── ceval.jpg
│   ├── llama.jpg
│   ├── llama.png
│   ├── llama2-chinese-webui.jpg
│   ├── llama3_eval.png
│   ├── llama_eval.jpeg
│   ├── meta_eval_13B.md
│   ├── meta_eval_7B.md
│   ├── tuned_eval.png
│   ├── wechat-new.jpeg
│   └── wechat.jpeg
├── data
│   ├── dev_sft.csv
│   ├── dev_sft_sharegpt.csv
│   └── train_sft.csv
├── docker
│   ├── Dockerfile
│   ├── Dockerfile_train
│   └── docker-compose.yml
├── docs
│   ├── chat_gradio_guide.md
│   └── inference_speed_guide.md
├── examples
│   ├── chat_gradio.py
│   ├── chat_gradio_no_merge.py
│   └── llama2_for_langchain.py
├── inference-speed
│   ├── CPU
│   │   └── ggml
│   │       └── README.md
│   └── GPU
│       ├── FasterTransformer_example
│       │   └── README.md
│       ├── JittorLLMs_example
│       │   └── README.md
│       ├── TensorRT-LLM_example
│       │   ├── README.md
│       │   ├── atom_inference.py
│       │   └── utils.py
│       ├── lmdeploy_example
│       │   ├── README.md
│       │   └── test_api_server.py
│       └── vllm_example
│           ├── README.md
│           ├── api_server.py
│           ├── client_test.py
│           ├── multi_gpus_api_server.sh
│           └── single_gpu_api_server.sh
├── requirements.txt
├── scripts
│   ├── api
│   │   ├── README.md
│   │   ├── accelerate_client.py
│   │   └── accelerate_server.py
│   ├── convert2hf
│   │   ├── README.md
│   │   └── convert_llama_weights_to_hf.py
│   └── test_model
│       └── test_pretrain_model.ipynb
└── train
    ├── merge_peft_model
    │   ├── merge.sh
    │   ├── merge_muilt.sh
    │   ├── merge_muilt_peft_adapter.py
    │   └── merge_peft_adapter.py
    ├── pretrain
    │   ├── accuracy.py
    │   ├── ds_config_zero2.json
    │   ├── ds_config_zero3.json
    │   ├── pretrain.sh
    │   └── pretrain_clm.py
    └── sft
        ├── accuracy.py
        ├── ds_config_zero2.json
        ├── finetune.sh
        ├── finetune_clm.py
        ├── finetune_clm_lora.py
        └── finetune_lora.sh
/assets/Llama4-Maverick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/Llama4-Maverick.png
--------------------------------------------------------------------------------
/assets/base_eval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/base_eval.png
--------------------------------------------------------------------------------
/assets/ceval.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/ceval.jpg
--------------------------------------------------------------------------------
/assets/llama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.jpg
--------------------------------------------------------------------------------
/assets/llama.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.png
--------------------------------------------------------------------------------
/assets/llama2-chinese-webui.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama2-chinese-webui.jpg
--------------------------------------------------------------------------------
/assets/llama3_eval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama3_eval.png
--------------------------------------------------------------------------------
/assets/llama_eval.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama_eval.jpeg
--------------------------------------------------------------------------------
/assets/tuned_eval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/tuned_eval.png
--------------------------------------------------------------------------------
/assets/wechat-new.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat-new.jpeg
--------------------------------------------------------------------------------
/assets/wechat.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat.jpeg
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as the base image
2 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel
3 |
4 | RUN apt-get update -y --allow-unauthenticated
5 | RUN apt install -y git vim git-lfs
6 |
7 | # Set the working directory
8 | WORKDIR /root/Llama-Chinese
9 |
10 | # Clone the Llama-Chinese repository from GitHub
11 | RUN git clone https://github.com/LlamaFamily/Llama-Chinese.git /root/Llama-Chinese
12 |
13 | # tsinghua source
14 | RUN mkdir -p ~/.pip
15 | RUN echo "[global]\nindex-url = https://pypi.tuna.tsinghua.edu.cn/simple" > ~/.pip/pip.conf
16 |
17 | # Install the dependencies from requirements.txt with pip
18 | RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -r requirements.txt
19 |
20 | # Clone the Hugging Face model repository
21 | RUN git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat
22 |
23 | # Expose port 7860
24 | EXPOSE 7860
25 |
26 | # Set the startup command
27 | ENTRYPOINT ["python", "examples/chat_gradio.py", "--model_name_or_path", "/root/Llama-Chinese/Atom-7B-Chat/"]
28 |
--------------------------------------------------------------------------------
/docker/Dockerfile_train:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as builder
2 | RUN apt-get update -y --allow-unauthenticated
3 | RUN apt install git tmux htop vim -y
4 | RUN pip install bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
5 | RUN pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
6 | RUN pip install peft -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
7 | RUN pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
8 | RUN pip install deepspeed -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
9 | RUN pip install scipy sentencepiece datasets joblib sentence_transformers cn2an evaluate tensorboard wandb -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 | services:
3 | app:
4 |     image: flagalpha/llama2-chinese:gradio # replace this with your actual image name
5 | volumes:
6 |       - /usr/local/nvidia:/usr/local/nvidia # give the container access to the host's NVIDIA driver
7 | environment:
8 |       - NVIDIA_VISIBLE_DEVICES=all # let the container access all NVIDIA GPUs
9 | ports:
10 |       - 7860:7860 # map the port between host and container
11 | deploy:
12 | resources:
13 | reservations:
14 | devices:
15 | - driver: nvidia
16 |               capabilities: [gpu] # use a Docker device request so the container can use the GPU
17 |
--------------------------------------------------------------------------------
/docs/chat_gradio_guide.md:
--------------------------------------------------------------------------------
1 | # Running chat_gradio.py in a Docker environment
2 | 
3 | Environment required on the host system
4 | 
5 | + docker: 24.0.2
6 | + docker-compose
7 | 
8 | ## Step 1. Prepare the Docker image
9 | 
10 | A Docker image makes it easier to manage the required environment and dependencies, so [chat_gradio](../examples/chat_gradio.py) can be started directly in a Docker container. The first step is to prepare the image.
11 | 
12 | ```bash
13 | git clone https://github.com/LlamaFamily/Llama-Chinese.git
14 | 
15 | cd Llama-Chinese
16 | 
17 | docker build -f docker/Dockerfile -t flagalpha/llama2-chinese:gradio .
18 | ```
19 | 
20 | ## Step 2. Start chat_gradio with docker-compose
21 | 
22 | 
23 | ```bash
24 | cd Llama-Chinese/docker
25 | docker-compose up -d --build
26 | ```
--------------------------------------------------------------------------------
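Once the containers are up, a quick reachability check against the mapped port confirms the Gradio app is serving. A minimal sketch, assuming the default 7860:7860 mapping from docker-compose.yml and a local Docker host:

```python
# Minimal reachability check for the chat_gradio service started above.
# Assumes the docker-compose port mapping 7860:7860 on the local machine.
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:7860", timeout=10) as resp:
    print("chat_gradio is up, HTTP status:", resp.status)
```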
/docs/inference_speed_guide.md:
--------------------------------------------------------------------------------
1 | # Inference deployment
2 | 
3 | > Whether a model has just been trained, has been fine-tuned, or was downloaded directly from [huggingface](https://huggingface.co/FlagAlpha), it has to be deployed before it can be used. Deployment here means model inference, and serving directly with vanilla transformers is fairly slow. Several acceleration options exist for inference and give much faster speeds.
4 | 
5 | 
6 | 
7 | ## 1. GPU inference options
8 | 
9 | ### Option 1: vllm
10 | 
11 | [Usage guide](../inference-speed/GPU/vllm_example/README.md)
12 | 
13 | ### Option 2: TensorRT-LLM
14 | 
15 | [Usage guide](../inference-speed/GPU/TensorRT-LLM_example/README.md)
16 | 
17 | 
18 | ## 2. CPU inference options
19 | 
20 | ### Option 1: ggml
21 | [Usage guide](../inference-speed/CPU/ggml/README.md)
22 | 
--------------------------------------------------------------------------------
/examples/chat_gradio.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import time
3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer
4 | from threading import Thread
5 | import torch,sys,os
6 | import json
7 | import pandas
8 | import argparse
9 |
10 | with gr.Blocks() as demo:
11 | gr.Markdown("""
智能助手
""")
12 | chatbot = gr.Chatbot()
13 | msg = gr.Textbox()
14 | state = gr.State()
15 | with gr.Row():
16 | clear = gr.Button("新话题")
17 | re_generate = gr.Button("重新回答")
18 | sent_bt = gr.Button("发送")
19 | with gr.Accordion("生成参数", open=False):
20 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3)
21 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95)
22 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0)
23 | def user(user_message, history):
24 | return "", history + [[user_message, None]]
25 | def bot(history,temperature,top_p,slider_context_times):
26 | if pandas.isnull(history[-1][1])==False:
27 | history[-1][1] = None
28 | yield history
29 | slider_context_times = int(slider_context_times)
30 | history_true = history[1:-1]
31 | prompt = ''
32 | if slider_context_times>0:
33 |             prompt += '\n'.join([("Human: "+one_chat[0].replace('<br>','\n')+'\n' if one_chat[0] else '') +"Assistant: "+one_chat[1].replace('<br>','\n')+'\n' for one_chat in history_true[-slider_context_times:] ])
34 |         prompt += "Human: "+history[-1][0].replace('<br>','\n')+"\nAssistant:"
35 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda')
36 | generate_input = {
37 | "input_ids":input_ids,
38 | "max_new_tokens":512,
39 | "do_sample":True,
40 | "top_k":50,
41 | "top_p":top_p,
42 | "temperature":temperature,
43 | "repetition_penalty":1.3,
44 | "streamer":streamer,
45 | "eos_token_id":tokenizer.eos_token_id,
46 | "bos_token_id":tokenizer.bos_token_id,
47 | "pad_token_id":tokenizer.pad_token_id
48 | }
49 | thread = Thread(target=model.generate, kwargs=generate_input)
50 | thread.start()
51 | start_time = time.time()
52 | bot_message =''
53 | print('Human:',history[-1][0])
54 | print('Assistant: ',end='',flush=True)
55 | for new_text in streamer:
56 | print(new_text,end='',flush=True)
57 | if len(new_text)==0:
58 | continue
59 | if new_text!='':
60 | bot_message+=new_text
61 | if 'Human:' in bot_message:
62 | bot_message = bot_message.split('Human:')[0]
63 | history[-1][1] = bot_message
64 | yield history
65 | end_time =time.time()
66 | print()
67 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message))
68 |
69 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
70 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot
71 | )
72 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
73 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot
74 | )
75 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot )
76 | clear.click(lambda: [], None, chatbot, queue=False)
77 |
78 | if __name__ == "__main__":
79 | parser = argparse.ArgumentParser()
80 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path')
81 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model')
82 | args = parser.parse_args()
83 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False)
84 | tokenizer.pad_token = tokenizer.eos_token
85 | if args.is_4bit==False:
86 | model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,
87 | device_map='cuda:0' if torch.cuda.is_available() else "auto",
88 | torch_dtype=torch.float16,
89 | load_in_8bit=True,
90 | trust_remote_code=True,
91 | use_flash_attention_2=True)
92 | model.eval()
93 | else:
94 | from auto_gptq import AutoGPTQForCausalLM
95 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False)
96 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True)
97 | if torch.__version__ >= "2" and sys.platform != "win32":
98 | model = torch.compile(model)
99 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0")
100 |
--------------------------------------------------------------------------------
/examples/chat_gradio_no_merge.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import time
3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer
4 | from threading import Thread
5 | from peft import PeftModel,PeftConfig
6 | import torch,sys,os
7 | import json
8 | import pandas
9 | import argparse
10 |
11 | with gr.Blocks() as demo:
12 | gr.Markdown("""智能助手
""")
13 | chatbot = gr.Chatbot()
14 | msg = gr.Textbox()
15 | state = gr.State()
16 | with gr.Row():
17 | clear = gr.Button("新话题")
18 | re_generate = gr.Button("重新回答")
19 | sent_bt = gr.Button("发送")
20 | with gr.Accordion("生成参数", open=False):
21 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3)
22 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95)
23 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0)
24 | def user(user_message, history):
25 | return "", history + [[user_message, None]]
26 | def bot(history,temperature,top_p,slider_context_times):
27 | if pandas.isnull(history[-1][1])==False:
28 | history[-1][1] = None
29 | yield history
30 | slider_context_times = int(slider_context_times)
31 | history_true = history[1:-1]
32 | prompt = ''
33 | if slider_context_times>0:
34 |             prompt += '\n'.join([("Human: "+one_chat[0].replace('<br>','\n')+'\n' if one_chat[0] else '') +"Assistant: "+one_chat[1].replace('<br>','\n')+'\n' for one_chat in history_true[-slider_context_times:] ])
35 |         prompt += "Human: "+history[-1][0].replace('<br>','\n')+"\nAssistant:"
36 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda')
37 | generate_input = {
38 | "input_ids":input_ids,
39 | "max_new_tokens":512,
40 | "do_sample":True,
41 | "top_k":50,
42 | "top_p":top_p,
43 | "temperature":temperature,
44 | "repetition_penalty":1.3,
45 | "streamer":streamer,
46 | "eos_token_id":tokenizer.eos_token_id,
47 | "bos_token_id":tokenizer.bos_token_id,
48 | "pad_token_id":tokenizer.pad_token_id
49 | }
50 | thread = Thread(target=model.generate, kwargs=generate_input)
51 | thread.start()
52 | start_time = time.time()
53 | bot_message =''
54 | print('Human:',history[-1][0])
55 | print('Assistant: ',end='',flush=True)
56 | for new_text in streamer:
57 | print(new_text,end='',flush=True)
58 | if len(new_text)==0:
59 | continue
60 | if new_text!='':
61 | bot_message+=new_text
62 | if 'Human:' in bot_message:
63 | bot_message = bot_message.split('Human:')[0]
64 | history[-1][1] = bot_message
65 | yield history
66 | end_time =time.time()
67 | print()
68 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message))
69 |
70 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
71 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot
72 | )
73 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
74 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot
75 | )
76 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot )
77 | clear.click(lambda: [], None, chatbot, queue=False)
78 |
79 | if __name__ == "__main__":
80 | parser = argparse.ArgumentParser()
81 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path')
82 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model')
83 | args = parser.parse_args()
84 | # tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False)
85 | # tokenizer.pad_token = tokenizer.eos_token
86 | if args.is_4bit==False:
87 | config = PeftConfig.from_pretrained(args.model_name_or_path)
88 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,use_fast=False)
89 | tokenizer.pad_token = tokenizer.eos_token
90 | model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
91 | device_map='cuda:0' if torch.cuda.is_available() else "auto",
92 | torch_dtype=torch.float16,
93 | load_in_8bit=True,
94 | low_cpu_mem_usage=True,
95 | trust_remote_code=True,
96 | use_flash_attention_2=True)
97 | model = PeftModel.from_pretrained(model, args.model_name_or_path, device_map={"": 0})
98 | model.eval()
99 | else:
100 | from auto_gptq import AutoGPTQForCausalLM
101 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False)
102 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True)
103 | if torch.__version__ >= "2" and sys.platform != "win32":
104 | model = torch.compile(model)
105 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0")
106 |
--------------------------------------------------------------------------------
/examples/llama2_for_langchain.py:
--------------------------------------------------------------------------------
1 | from langchain.llms.base import LLM
2 | from typing import Dict, List, Any, Optional
3 | import torch,sys,os
4 | from transformers import AutoTokenizer
5 |
6 |
7 | class Llama2(LLM):
8 | max_token: int = 2048
9 | temperature: float = 0.1
10 | top_p: float = 0.95
11 | tokenizer: Any
12 | model: Any
13 |
14 | def __init__(self, model_name_or_path, bit4=False):
15 | super().__init__()
16 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,use_fast=False)
17 | self.tokenizer.pad_token = self.tokenizer.eos_token
18 | if bit4==False:
19 | from transformers import AutoModelForCausalLM
20 | device_map = "cuda:0" if torch.cuda.is_available() else "auto"
21 | self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,device_map=device_map,torch_dtype=torch.float16,load_in_8bit=True,trust_remote_code=True,use_flash_attention_2=True)
22 | self.model.eval()
23 | else:
24 | from auto_gptq import AutoGPTQForCausalLM
25 | self.model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False)
26 |
27 | if torch.__version__ >= "2" and sys.platform != "win32":
28 | self.model = torch.compile(self.model)
29 |
30 | @property
31 | def _llm_type(self) -> str:
32 | return "Llama2"
33 |
34 | def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
35 | print('prompt:',prompt)
36 | input_ids = self.tokenizer(prompt, return_tensors="pt",add_special_tokens=False).input_ids.to('cuda')
37 | generate_input = {
38 | "input_ids":input_ids,
39 | "max_new_tokens":1024,
40 | "do_sample":True,
41 | "top_k":50,
42 | "top_p":self.top_p,
43 | "temperature":self.temperature,
44 | "repetition_penalty":1.2,
45 | "eos_token_id":self.tokenizer.eos_token_id,
46 | "bos_token_id":self.tokenizer.bos_token_id,
47 | "pad_token_id":self.tokenizer.pad_token_id
48 | }
49 | generate_ids = self.model.generate(**generate_input)
50 | generate_ids = [item[len(input_ids[0]):-1] for item in generate_ids]
51 | result_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
52 | return result_message
53 |
--------------------------------------------------------------------------------
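A minimal usage sketch for the Llama2 wrapper above; the import path and model location are assumptions (run it from the examples directory with a locally downloaded chat model such as FlagAlpha/Atom-7B-Chat), and the prompt follows the Human/Assistant convention used elsewhere in this repository:

```python
# Hypothetical usage of the Llama2 LangChain wrapper defined above.
# The model path is an assumption; point it at your local download.
from llama2_for_langchain import Llama2

llm = Llama2(model_name_or_path="/path/FlagAlpha/Atom-7B-Chat")

# The wrapper's _call() runs through the standard LangChain LLM interface.
print(llm("Human: 介绍一下北京\nAssistant:"))
```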
/inference-speed/CPU/ggml/README.md:
--------------------------------------------------------------------------------
1 | ## Quantized deployment with llama.cpp
2 | 
3 | Using the [llama.cpp tool](https://github.com/Rayrtfr/llama.cpp) as an example, this guide covers the detailed steps to quantize a model and deploy it locally. On Windows you may need to install build tools such as cmake. **For a quick local deployment, we recommend the instruction-tuned [Atom-7B-Chat](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#%E5%9F%BA%E4%BA%8Ellama2%E7%9A%84%E4%B8%AD%E6%96%87%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8Batom) model; if your hardware allows, the 6-bit or 8-bit quantization gives better results.** Before running, make sure that:
4 | 
5 | 1. The system has the `make` (bundled with macOS/Linux) or `cmake` (must be installed separately on Windows) build tool
6 | 2. Python 3.10 or later is recommended for building and running this tool
7 |
8 |
9 | ### Step 1: Clone and build llama.cpp
10 | 
11 | 1. (Optional) If you already downloaded an older copy of the repository, it is recommended to `git pull` the latest code **and run `make clean`**
12 | 1. Pull the latest llama.cpp code adapted for the Atom models
13 |
14 | ```bash
15 | $ git clone https://github.com/Rayrtfr/llama.cpp
16 | ```
17 |
18 | 2. Build the llama.cpp project to produce the `./main` (for inference) and `./quantize` (for quantization) binaries.
19 |
20 | ```bash
21 | $ make
22 | ```
23 |
24 | **Windows/Linux users** who want GPU inference are recommended to [build with BLAS (or cuBLAS if a GPU is available)](https://github.com/Rayrtfr/llama.cpp#blas-build), which speeds up prompt processing. The command below builds with cuBLAS and applies to NVIDIA GPUs. Reference: [llama.cpp#blas-build](https://github.com/Rayrtfr/llama.cpp#blas-build)
25 |
26 | ```bash
27 | $ make LLAMA_CUBLAS=1
28 | ```
29 |
30 | **macOS users** need no extra steps: llama.cpp is already optimized for ARM NEON, and BLAS is enabled automatically. On M-series chips, enabling GPU inference via Metal is recommended for a significant speedup; just change the build command to `LLAMA_METAL=1 make`, see [llama.cpp#metal-build](https://github.com/Rayrtfr/llama.cpp#metal-build)
31 |
32 | ```bash
33 | $ LLAMA_METAL=1 make
34 | ```
35 |
36 | ### Step 2: Generate a quantized model
37 | 
38 | llama.cpp can now convert both `.safetensors` files and huggingface-format `.bin` files to FP16 GGUF.
39 | 
40 | /path/Atom-7B-Chat is the directory where the model was downloaded.
41 | ```bash
42 | $ python convert.py --outfile ./atom-7B-cpp.gguf /path/Atom-7B-Chat
43 |
44 | $ ./quantize ./atom-7B-cpp.gguf ./ggml-atom-7B-q4_0.gguf q4_0
45 | ```
46 |
47 | ### Step 3: Load and run the model
48 | 
49 | 
50 | - For GPU inference: cuBLAS/Metal builds need the number of offloaded layers, e.g. passing `-ngl 40` to `./main` offloads 40 layers of model parameters to the GPU
51 | 
52 | 
53 | Start a chat with the following command.
54 | ```bash
55 | text="Human: 介绍一下北京\nAssistant:"
56 | ./main -m \
57 | ./ggml-atom-7B-q4_0.gguf \
58 | -p "${text}" \
59 | --logdir ./logtxt
60 | ```
61 | To carry chat context, adjust the text above to something like:
62 | ```bash
63 | text="Human: 介绍一下北京\nAssistant:北京是一个美丽的城市\nHuman: 再介绍一下合肥\nAssistant:"
64 | ```
65 |
66 | For more details, see the official documentation: [https://github.com/ggerganov/llama.cpp/tree/master/examples/main](https://github.com/ggerganov/llama.cpp/tree/master/examples/main)
67 |
--------------------------------------------------------------------------------
/inference-speed/GPU/FasterTransformer_example/README.md:
--------------------------------------------------------------------------------
1 | # FasterTransformer && Triton installation and usage
2 | 
3 | FasterTransformer & Triton accelerate LLama2 model inference. fp16 and Int8 inference are currently supported; Int4 is not supported yet.
4 |
5 | ## 0. Prepare environment variables
6 |
7 | ```bash
8 | export BUILD_DICTIONARY="/workspace/build"
9 | export TRITON_VERSION=23.04
10 | ```
11 |
12 |
13 | ## 1. Build the image
14 | 
15 | 
16 | 1. Build the image
17 |
18 | ```bash
19 | cd $BUILD_DICTIONARY
20 | git clone https://github.com/Rayrtfr/fastertransformer_backend.git
21 |
22 | cd $BUILD_DICTIONARY/fastertransformer_backend
23 |
24 | export TRITON_VERSION=23.04
25 |
26 | # If you prefer not to build it with the command below, you can pull our prebuilt image instead: docker pull xiangtao1994/atom_triton_ft:23.04
27 | docker build --build-arg TRITON_VERSION=${TRITON_VERSION} -t triton_ft_backend:${TRITON_VERSION} -f docker/Dockerfile .
28 |
29 | ```
30 | The TRITON_VERSION=23.04 image requires GPU driver version 535.54.03. If your GPU driver is a different version, see [https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12)
31 | to find the triton-inference-server release that matches your CUDA driver version.
32 | 
33 | 
34 | 2. Start the container
35 |
36 | ```
37 | # Start the container
38 | export TRITON_VERSION=23.04
39 |
40 | # Note: BUILD_DICTIONARY must be mounted into the container
41 | docker run -idt --gpus=all --net=host --shm-size=4G --name triton_ft_backend_pure \
42 | -v $BUILD_DICTIONARY:$BUILD_DICTIONARY \
43 | -p18888:8888 -p18000:8000 -p18001:8001 -p18002:8002 triton_ft_backend:${TRITON_VERSION} bash
44 |
45 | ```
46 |
47 | ## 2. Steps inside the container
48 | 
49 | The following describes how to convert the [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) weights into FasterTransformer format. [Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) works the same way.
50 | 
51 | 1. Convert the weights into FasterTransformer format
52 |
53 | ```
54 | cd $BUILD_DICTIONARY && git clone https://github.com/Rayrtfr/FasterTransformer.git
55 |
56 | cd $BUILD_DICTIONARY/FasterTransformer
57 |
58 | mkdir models && chmod -R 777 ./*
59 |
60 | python3 ./examples/cpp/llama/huggingface_llama_convert.py \
61 | -saved_dir=./models/llama \
62 | -in_file=/path/FlagAlpha/Atom-7B-Chat \
63 | -infer_gpu_num=1 \
64 | -weight_data_type=fp16 \
65 | -model_name=llama
66 | ```
67 |
68 | 2. Update the model configuration
69 | 
70 | - Edit config.pbtxt
71 |
72 | ``` bash
73 | mkdir $BUILD_DICTIONARY/triton-model-store/
74 |
75 | cd $BUILD_DICTIONARY/triton-model-store/
76 |
77 | cp -r $BUILD_DICTIONARY/fastertransformer_backend/all_models/llama $BUILD_DICTIONARY/triton-model-store/
78 |
79 | # Edit triton-model-store/llama/fastertransformer/config.pbtxt
80 |
81 | parameters {
82 | key: "tensor_para_size"
83 | value: {
84 | string_value: "1"
85 | }
86 | }
87 |
88 | ## Set model_checkpoint_path to the output path of the conversion above
89 | parameters {
90 | key: "model_checkpoint_path"
91 | value: {
92 | string_value: "/workspace/build/FasterTransformer/models/llama/1-gpu/"
93 | }
94 | }
95 |
96 | ## To use int8 inference, also add the following configuration
97 | parameters {
98 | key: "int8_mode"
99 | value: {
100 | string_value: "1"
101 | }
102 | }
103 | ```
104 |
105 |
106 | Edit model.py
107 |
108 | ```
109 | # Edit these two files
110 | triton-model-store/llama/preprocessing/1/model.py
111 | triton-model-store/llama/postprocessing/1/model.py
112 |
113 | # Check that this path points to the tokenizer
114 | self.tokenizer = LlamaTokenizer.from_pretrained("/path/FlagAlpha/Atom-7B-Chat")
115 | ```
116 |
117 |
118 | 3. Build the FasterTransformer library
119 | 
120 | (For the same model type, this only needs to be done once.)
121 | Before building, check FasterTransformer/examples/cpp/llama/llama_config.ini
122 |
123 | ```bash
124 | # 1 for single-GPU inference; for multi-GPU, set this to the number of GPUs
125 | tensor_para_size=1
126 |
127 | model_dir=/workspace/build/FasterTransformer/models/llama/1-gpu/
128 | ```
129 |
130 | Build FasterTransformer
131 | ```bash
132 | cd $BUILD_DICTIONARY/FasterTransformer
133 |
134 | git submodule init && git submodule update
135 | pip3 install fire jax jaxlib transformers
136 |
137 | mkdir build && cd build
138 | cmake -DSM=86 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON -D PYTHON_PATH=/usr/bin/python3 ..
139 | make -j12
140 | make install
141 | ```
142 |
143 |
144 | ## 3. Start the triton server
145 | 
146 | Run the following inside the same container as above.
147 | ```
148 | CUDA_VISIBLE_DEVICES=0 /opt/tritonserver/bin/tritonserver --model-repository=$BUILD_DICTIONARY/triton-model-store/llama/
149 | ```
150 | Output
151 | ```
152 | I0717 17:17:14.670037 70681 grpc_server.cc:2450] Started GRPCInferenceService at 0.0.0.0:8001
153 | I0717 17:17:14.670495 70681 http_server.cc:3555] Started HTTPService at 0.0.0.0:8000
154 | I0717 17:17:14.713000 70681 http_server.cc:185] Started Metrics Service at 0.0.0.0:8002
155 | ```
156 |
157 |
158 | Still inside the container, start the client test (if running it outside the container, note that the port in the url argument below must be adjusted)
159 |
160 | ```
161 | python3 $BUILD_DICTIONARY/fastertransformer_backend/inference_example/llama/llama_grpc_stream_client.py \
162 | --url 127.0.0.1:8001 \
163 | --hf_model_location /path/FlagAlpha/Atom-7B-Chat \
164 | -topp 0.95
165 | ```
166 |
--------------------------------------------------------------------------------
/inference-speed/GPU/JittorLLMs_example/README.md:
--------------------------------------------------------------------------------
1 | # JittorLLMs inference deployment
2 | 
3 | ## Requirements
4 | 
5 | * Memory: at least 2GB, 32GB recommended
6 | * GPU memory: optional, 16GB recommended
7 | * OS: Windows, Mac and Linux are all supported.
8 | * Disk space: at least 40GB free, for downloading weights and storing swap files.
9 | * Python version: at least `3.9`.
10 | 
11 | When disk space is insufficient, the environment variable `JITTOR_HOME` can be used to choose where the cache is stored.
12 | If the process gets killed because memory or GPU memory runs out, see [how to limit memory consumption](#low-resource-configuration) below.
13 |
14 | ## Deployment
15 | 
16 | Install the dependencies with the command below. (Note: this script installs the Jittor build of torch; it is recommended to run it in a fresh environment.)
17 |
18 | ```
19 | # inside mainland China, clone from gitlink
20 | git clone https://gitlink.org.cn/jittor/JittorLLMs.git --depth 1
21 | # github: git clone https://github.com/Jittor/JittorLLMs.git --depth 1
22 | cd JittorLLMs
23 | # -i selects the jittor package index; -I forces a reinstall of the Jittor build of torch
24 | pip install -r requirements.txt -i https://pypi.jittor.org/simple -I
25 | ```
26 |
27 | If you get an error that no matching jittor version can be found, the mirror you are using may be out of date; update to the latest version with: `pip install jittor -U -i https://pypi.org/simple`
28 |
29 | Deployment takes just one command:
30 |
31 | ```
32 | python cli_demo.py atom7b
33 | ```
34 |
35 | On first run, the model files are downloaded automatically from the server and take up some disk space under the root directory.
36 | Some CUDA operators are compiled the first time it runs, which adds some loading time.
37 | 
38 | If the process gets killed because memory or GPU memory runs out, see [how to limit memory consumption](#low-resource-configuration) below.
39 |
40 | ### WebDemo
41 | 
42 | Through the gradio library, JittorLLM lets users chat with the large model directly in the browser.
43 |
44 | ~~~bash
45 | python web_demo.py atom7b
46 | ~~~
47 |
48 | ### Backend service deployment
49 | 
50 | JittorLLM provides an example of setting up a backend service in the api.py file.
51 |
52 | ~~~bash
53 | python api.py atom7b
54 | ~~~
55 |
56 | You can then query it directly with the following code
57 |
58 | ~~~python
59 | import json, requests; post_data = json.dumps({'prompt': 'Hello, solve 5x=13'})
60 | print(json.loads(requests.post("http://0.0.0.0:8000", post_data).text)['response'])
61 | ~~~
62 |
63 | ## Low-resource configuration
64 | 
65 | To address the heavy GPU-memory consumption of large models, the Jittor team developed a dynamic swapping technique. Jittor is the first framework to support automatic swapping of dynamic-graph variables: unlike earlier static-graph swapping approaches, users do not have to modify any code; native dynamic-graph code supports tensor swapping directly, and tensor data is moved automatically between GPU memory, host memory and disk, which lowers the development burden.
66 | 
67 | Jittor's large-model inference library also has the lowest hardware requirements among comparable frameworks: with only enough disk for the weights and 2GB of RAM, and no GPU at all, it can still deploy a large model. In comparisons of resource consumption and speed across hardware configurations, JittorLLMs outperforms comparable frameworks when GPU memory is plentiful, and still runs at a reasonable speed when GPU memory is scarce or there is no GPU at all.
68 | 
69 | To save memory, install Jittor version 1.3.7.8 or later and add the following environment variables:
70 | ```bash
71 | export JT_SAVE_MEM=1
72 | # limit CPU memory usage to at most 16GB
73 | export cpu_mem_limit=16000000000
74 | # limit device memory (gpu, tpu, etc.) usage to at most 8GB
75 | export device_mem_limit=8000000000
76 | # Windows users: use powershell
77 | # $env:JT_SAVE_MEM="1"
78 | # $env:cpu_mem_limit="16000000000"
79 | # $env:device_mem_limit="8000000000"
80 | ```
81 | You can set the CPU and device memory limits freely; to leave memory unlimited, set the value to `-1`.
82 | ```bash
83 | # no limit on CPU memory
84 | export cpu_mem_limit=-1
85 | # no limit on device memory (gpu, tpu, etc.)
86 | export device_mem_limit=-1
87 | # Windows users: use powershell
88 | # $env:JT_SAVE_MEM="1"
89 | # $env:cpu_mem_limit="-1"
90 | # $env:device_mem_limit="-1"
91 | ```
92 |
93 | To clean up the disk swap files, run the following command
94 | ```bash
95 | python -m jittor_utils.clean_cache swap
96 | ```
97 |
--------------------------------------------------------------------------------
/inference-speed/GPU/TensorRT-LLM_example/README.md:
--------------------------------------------------------------------------------
1 | # Deploying LLama2 or Atom with NVIDIA TensorRT-LLM
2 | 
3 | [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA. Follow the steps below to deploy a LLama2 or Atom model with TensorRT-LLM.
4 | 
5 | The deployment flow below follows [TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) and requires NVIDIA GPU driver version 535 or later on the machine
6 |
7 | ## Support Matrix
8 | * FP16
9 | * FP8
10 | * INT8 & INT4 Weight-Only
11 | * SmoothQuant
12 | * Groupwise quantization (AWQ/GPTQ)
13 | * FP8 KV CACHE
14 | * INT8 KV CACHE (+ AWQ/per-channel weight-only)
15 | * Tensor Parallel
16 | * STRONGLY TYPED
17 |
18 | ## 1. Install TensorRT-LLM
19 | #### Get the TensorRT-LLM code:
20 |
21 | ```bash
22 | # the TensorRT-LLM code must be pulled with git-lfs
23 | apt-get update && apt-get -y install git git-lfs
24 |
25 | git clone https://github.com/NVIDIA/TensorRT-LLM.git
26 | cd TensorRT-LLM
27 |
28 | # this walkthrough uses the v0.7.0 release
29 | git checkout tags/v0.7.0 -b release/0.7.0
30 | git submodule update --init --recursive
31 | git lfs install
32 | git lfs pull
33 | ```
34 | #### Build the docker image and install TensorRT-LLM
35 | ```bash
36 | make -C docker release_build
37 | ```
38 |
39 | #### Run the docker image:
40 | ```bash
41 | make -C docker release_run
42 | ```
43 |
44 | ## 2. Build the TensorRT-LLM inference engine for the LLama2 model:
45 | 
46 | #### Enter the build folder:
47 | ```bash
48 | cd ./examples/llama
49 | ```
50 |
51 | #### Download the Atom or LLama2 model from Huggingface:
52 | ```
53 | # download whichever model you want to deploy
54 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat
55 | mv Atom-7B-Chat /origin_model
56 | ```
57 |
58 | #### Build the inference engine with build.py:
59 | The following is a common example; see [TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) for more parameters
60 | ```bash
61 | python build.py --max_batch_size 1 --max_num_tokens 8192 --model_dir /origin_model --dtype float16 --remove_input_padding --use_inflight_batching --paged_kv_cache --use_weight_only --enable_context_fmha --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --output_dir /model/tensorrt_llm/1 --world_size 1 --tp_size 1 --pp_size 1 --max_input_len 7168 --max_output_len 1024 --multi_block_mode --rotary_scaling dynamic 8.0 --rotary_base 500000
62 | ```
63 |
64 | ## 3. Inference with the TensorRT-LLM Python Runtime
65 | 
66 | #### Start a single-node, single-GPU service with the Python class we provide
67 | ```bash
68 | # arguments: 1) the engine output path from build.py, 2) the model tokenizer path, 3) the question to ask
69 | python atom_inference.py \
70 |     /model/tensorrt_llm/1 \
71 |     /origin_model \
72 |     如何成为一个更加优秀的人
73 | ```
--------------------------------------------------------------------------------
/inference-speed/GPU/TensorRT-LLM_example/atom_inference.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import sys
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | import torch
7 | from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES,
8 | load_tokenizer, read_model_name, throttle_generator)
9 |
10 | import tensorrt_llm
11 | from tensorrt_llm.logger import logger
12 | from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner
13 |
14 | if PYTHON_BINDINGS:
15 | from tensorrt_llm.runtime import ModelRunnerCpp
16 |
17 | class AtomTRTApi:
18 | def __init__(self,engine_dir,tokenizer_dir,max_input_length=4096):
19 | self.runtime_rank = tensorrt_llm.mpi_rank()
20 | self.model_name = read_model_name(engine_dir)
21 |
22 | self.tokenizer, self.pad_id, self.end_id = load_tokenizer(
23 | tokenizer_dir=tokenizer_dir,
24 | tokenizer_type='llama',
25 | )
26 | self.use_py_session=False
27 | if not PYTHON_BINDINGS:
28 | logger.warning(
29 | "Python bindings of C++ session is unavailable, fallback to Python session."
30 | )
31 | self.use_py_session = True
32 | runner_cls = ModelRunner if self.use_py_session else ModelRunnerCpp
33 | runner_kwargs = dict(engine_dir=engine_dir,
34 | lora_dir=None,
35 | rank=self.runtime_rank,
36 | debug_mode=False,
37 | lora_ckpt_source='hf')
38 |
39 | if not self.use_py_session:
40 | runner_kwargs.update(
41 | max_batch_size=1,
42 | max_input_len=max_input_length,
43 | max_output_len=2048,
44 | max_beam_width=1,
45 | max_attention_window_size=None)
46 | self.runner = runner_cls.from_dir(**runner_kwargs)
47 |
48 |
49 | def ask(self,input_text,temperature=0.4,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',merge_lambda=None,max_input_length=4096,append_next_role=True):
50 | with torch.no_grad():
51 | prompt = ''
52 | print('max_input_length',max_input_length)
53 | if type(input_text)==list:
54 | for input_text_one in input_text[::-1]:
55 |                     if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n") < max_input_length:
56 |                         prompt = ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" + prompt
57 | if append_next_role:
58 | if input_text[-1]['role']=='Human':
59 | prompt += "Assistant:"
60 | else:
61 | prompt += "Human:"
62 | else:
63 | if merge_lambda is None:
64 | if append_next_role:
65 | prompt += "Human: "+input_text.strip()+"\nAssistant:"
66 | else:
67 | prompt += "Human: "+input_text.strip()+"\n"
68 | else:
69 | prompt += merge_lambda(input_text)
70 | if len(system_prefix)>0:
71 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt
72 | print('输入模型的完整输入:',prompt)
73 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids]
74 | print(input_ids)
75 | input_ids = [
76 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids
77 | ]
78 | print('输入模型的token数量',input_ids[0].shape)
79 | generate_input = {
80 | "batch_input_ids":input_ids,
81 | "max_new_tokens":max_new_tokens,
82 | "max_attention_window_size":None,
83 | "do_sample":True,
84 | "top_k":50,
85 | "top_p":top_p,
86 | "num_beams":1,
87 | "length_penalty":1.0,
88 | "stop_words_list":None,
89 | "bad_words_list":None,
90 | "streaming":False,
91 | "temperature":temperature,
92 | "output_sequence_lengths":True,
93 | "return_dict":False,
94 | "repetition_penalty":repetition_penalty,
95 | "end_id":self.tokenizer.eos_token_id,
96 | "bos_token_id":self.tokenizer.bos_token_id,
97 | "pad_id":self.tokenizer.pad_token_id
98 | }
99 | generate_ids = self.runner.generate(**generate_input)
100 | torch.cuda.synchronize()
101 | print(generate_ids)
102 | generate_ids = generate_ids.cpu().tolist()
103 | generate_ids = [item[0][len(input_ids[0][0]):] for item in generate_ids]
104 | try:
105 | generate_ids = [item[:item.index(self.tokenizer.eos_token_id)] for item in generate_ids ]
106 | except:
107 | pass
108 | print(generate_ids)
109 | # output = ''.join(tokenizer.convert_ids_to_tokens(generate_ids[0]))
110 | # print('生成的token长度',len(generate_ids[0]))
111 | bot_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
112 | if 'Human:' in bot_message:
113 | bot_message = bot_message.split('Human:')[0]
114 | print(bot_message)
115 | return bot_message.strip()
116 |
117 | def ask_streaming(self,input_text,temperature=0.8,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',max_input_length=4096,append_next_role=True):
118 | with torch.no_grad():
119 | prompt = ''
120 | print('max_input_length',max_input_length)
121 | if type(input_text)==list:
122 | for input_text_one in input_text[::-1]:
123 |                     if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n") < max_input_length:
124 |                         prompt = ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" + prompt
125 | if append_next_role:
126 | if input_text[-1]['role']=='Human':
127 | prompt += "Assistant:"
128 | else:
129 | prompt += "Human:"
130 | else:
131 | if append_next_role:
132 | prompt += "Human: "+input_text.strip()+"\nAssistant:"
133 | else:
134 | prompt += "Human: "+input_text.strip()+"\n"
135 | if len(system_prefix)>0:
136 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt
137 | print('输入模型的完整输入:',prompt)
138 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids]
139 | print(input_ids)
140 | input_ids = [
141 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids
142 | ]
143 | print('输入模型的token数量',input_ids[0].shape)
144 | generate_input = {
145 | "batch_input_ids":input_ids,
146 | "max_new_tokens":max_new_tokens,
147 | "max_attention_window_size":None,
148 | "do_sample":True,
149 | "top_k":50,
150 | "top_p":top_p,
151 | "num_beams":1,
152 | "length_penalty":1.0,
153 | "stop_words_list":None,
154 | "bad_words_list":None,
155 | "streaming":True,
156 | "temperature":temperature,
157 | "output_sequence_lengths":True,
158 | "return_dict":True,
159 | "repetition_penalty":repetition_penalty,
160 | "end_id":self.tokenizer.eos_token_id,
161 | "bos_token_id":self.tokenizer.bos_token_id,
162 | "pad_id":self.tokenizer.pad_token_id
163 | }
164 | generate_ids = self.runner.generate(**generate_input)
165 | torch.cuda.synchronize()
166 |
167 | input_token_num = len(input_ids[0][0])
168 | answer_message =''
169 | for curr_outputs in throttle_generator(generate_ids,2):
170 | output_ids = curr_outputs['output_ids']
171 | sequence_lengths = curr_outputs['sequence_lengths']
172 | # print(sequence_lengths)
173 | output_ids = output_ids.cpu().tolist()
174 | output_ids = [item[0][input_token_num:sequence_lengths[0][0]] for item in output_ids]
175 | answer_message = self.tokenizer.batch_decode(output_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
176 | if 'Human:' in answer_message:
177 | answer_message = answer_message.split('Human:')[0]
178 | yield answer_message.strip()
179 | return answer_message.strip()
180 |
181 |
182 | if __name__=='__main__':
183 | model = AtomTRTApi(engine_dir=sys.argv[1],tokenizer_dir=sys.argv[2])
184 | model.ask('如何成为一个更优秀的人')
185 |
--------------------------------------------------------------------------------
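A minimal usage sketch for the AtomTRTApi class above; the engine and tokenizer paths are placeholders and should point at the build.py output directory and the downloaded model described in the README:

```python
# Hypothetical paths; adjust to your build.py output and model download location.
from atom_inference import AtomTRTApi

api = AtomTRTApi(engine_dir="/model/tensorrt_llm/1", tokenizer_dir="/origin_model")

# Non-streaming call: returns the full answer at once.
print(api.ask("如何成为一个更优秀的人"))

# Streaming call: yields progressively longer partial answers.
for partial in api.ask_streaming("如何成为一个更优秀的人"):
    print(partial)
```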
/inference-speed/GPU/TensorRT-LLM_example/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import json
17 | from pathlib import Path
18 | from typing import Optional
19 | from typing import Union
20 |
21 | from transformers import AutoTokenizer, T5Tokenizer
22 |
23 | import tensorrt_llm
24 |
25 | DEFAULT_HF_MODEL_DIRS = {
26 | 'baichuan': 'baichuan-inc/Baichuan-13B-Chat',
27 | 'bloom': 'bigscience/bloom-560m',
28 | 'chatglm_6b': 'THUDM/chatglm-6b',
29 | 'chatglm2_6b': 'THUDM/chatglm2-6b',
30 | 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k',
31 | 'chatglm3_6b': 'THUDM/chatglm3-6b',
32 | 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base',
33 | 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k',
34 | 'falcon': 'tiiuae/falcon-rw-1b',
35 | 'glm_10b': 'THUDM/glm-10b',
36 | 'gpt': 'gpt2-medium',
37 | 'gptj': 'EleutherAI/gpt-j-6b',
38 | 'gptneox': 'EleutherAI/gpt-neox-20b',
39 | 'internlm': 'internlm/internlm-chat-7b',
40 | 'llama': 'meta-llama/Llama-2-7b-hf',
41 | 'mpt': 'mosaicml/mpt-7b',
42 | 'phi': 'microsoft/phi-2',
43 | 'opt': 'facebook/opt-350m',
44 | 'qwen': 'Qwen/Qwen-7B',
45 | }
46 |
47 | DEFAULT_PROMPT_TEMPLATES = {
48 | 'internlm':
49 | "<|User|>:{input_text}\n<|Bot|>:",
50 | 'qwen':
51 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
52 | }
53 |
54 | def get_engine_version(engine_dir: str) -> Union[None, str]:
55 | engine_dir = Path(engine_dir)
56 | config_path = engine_dir / "config.json"
57 | with open(config_path, 'r') as f:
58 | config = json.load(f)
59 |
60 | if 'version' not in config:
61 | return None
62 |
63 | return config['version']
64 |
65 | def read_model_name(engine_dir: str):
66 | engine_version = get_engine_version(engine_dir)
67 |
68 | with open(Path(engine_dir) / "config.json", 'r') as f:
69 | config = json.load(f)
70 |
71 | if engine_version is None:
72 | return config['builder_config']['name']
73 |
74 | return config['pretrained_config']['architecture']
75 |
76 |
77 | def throttle_generator(generator, stream_interval):
78 | for i, out in enumerate(generator):
79 | if not i % stream_interval:
80 | yield out
81 |
82 | if i % stream_interval:
83 | yield out
84 |
85 |
86 | def load_tokenizer(tokenizer_dir: Optional[str] = None,
87 | vocab_file: Optional[str] = None,
88 | model_name: str = 'gpt',
89 | tokenizer_type: Optional[str] = None):
90 | if vocab_file is None:
91 | use_fast = True
92 | if tokenizer_type is not None and tokenizer_type == "llama":
93 | use_fast = False
94 | # Should set both padding_side and truncation_side to be 'left'
95 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
96 | legacy=False,
97 | padding_side='left',
98 | truncation_side='left',
99 | trust_remote_code=True,
100 | tokenizer_type=tokenizer_type,
101 | use_fast=use_fast)
102 | else:
103 | # For gpt-next, directly load from tokenizer.model
104 | assert model_name == 'gpt'
105 | tokenizer = T5Tokenizer(vocab_file=vocab_file,
106 | padding_side='left',
107 | truncation_side='left')
108 |
109 | if model_name == 'qwen':
110 | with open(Path(tokenizer_dir) / "generation_config.json") as f:
111 | gen_config = json.load(f)
112 | chat_format = gen_config['chat_format']
113 | if chat_format == 'raw':
114 | pad_id = gen_config['pad_token_id']
115 | end_id = gen_config['eos_token_id']
116 | elif chat_format == 'chatml':
117 | pad_id = tokenizer.im_end_id
118 | end_id = tokenizer.im_end_id
119 | else:
120 | raise Exception(f"unknown chat format: {chat_format}")
121 | elif model_name == 'glm_10b':
122 | pad_id = tokenizer.pad_token_id
123 | end_id = tokenizer.eop_token_id
124 | else:
125 | if tokenizer.pad_token_id is None:
126 | tokenizer.pad_token_id = tokenizer.eos_token_id
127 | pad_id = tokenizer.pad_token_id
128 | end_id = tokenizer.eos_token_id
129 |
130 | return tokenizer, pad_id, end_id
131 |
--------------------------------------------------------------------------------
/inference-speed/GPU/lmdeploy_example/README.md:
--------------------------------------------------------------------------------
1 | # lmdeploy installation and usage
2 | 
3 | lmdeploy supports transformer architectures (e.g. Atom, LLaMA, LLaMa2, InternLM, Vicuna), currently with fp16, int8 and int4.
4 |
5 | ## 1. Installation
6 | 
7 | Install the prebuilt python package
8 | ```
9 | python3 -m pip install lmdeploy==0.2.1
10 | ```
11 |
12 | ## 2. Convert the huggingface model to lmdeploy format
13 | 
14 | Convert the model into lmdeploy's inference format. Assuming the huggingface [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) model has been downloaded to `/models/Atom-7B-Chat`, the result is written to a `workspace` folder under the current working directory
15 |
16 | ```shell
17 | lmdeploy convert llama2 /models/Atom-7B-Chat
18 | ```
19 | Patch one bug in lmdeploy
20 | ```
21 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/preprocessing/1/tokenizer/tokenizer.py
22 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/postprocessing/1/tokenizer/tokenizer.py
23 | ```
24 |
25 |
26 | ## 3. kv cache int8 quantization
27 | For an Atom-7B fp16 model with a maximum length of 2048, every concurrent session created on the server needs roughly 1030MB of GPU memory for the kv_cache, so even an A100 80G can serve only a very limited number of users.
28 | To reduce runtime GPU memory, lmdeploy implements kv cache PTQ quantization, so the same amount of memory can serve more concurrent users.
29 | First compute the model statistics and save them to a temporary directory named atom
30 | ```shell
31 | mkdir atom
32 | lmdeploy lite calibrate \
33 | /models/Atom-7B-Chat \ # the huggingface Atom model; llama/vicuna/internlm/baichuan etc. are also supported
34 | --calib-dataset 'ptb' \ # calibration dataset; c4, ptb, wikitext2 and pileval are supported
35 | --calib-samples 128 \ # number of calibration samples; reduce if GPU memory is insufficient
36 | --device 'cuda' \ # device used for calibration
37 | --work-dir atom # folder for the pth-format quantization statistics and quantized weights
38 | ```
39 | Note: flash_attn may need to be installed
40 | ```shell
41 | conda install -c nvidia cuda-nvcc # so flash_attn can be built with the CUDA toolchain inside conda
42 | pip install flash_attn
43 | ```
44 |
45 |
46 | Then use the statistics in the atom directory to compute the quantization parameters and save the converted parameters under `workspace/triton_models/weights`
47 |
48 | ```shell
49 | lmdeploy lite kv_qparams \
50 | ./atom \ # the atom statistics computed in the previous step
51 | ./workspace/triton_models/weights \ # directory where the results are saved
52 | --num-tp 1 # number of tensor-parallel GPUs
53 | ```
54 |
55 | Edit the inference configuration to enable kv cache int8. In `workspace/triton_models/weights/config.ini`:
56 | * set `use_context_fmha` to 0, which disables flashattention
57 | * set `quant_policy` to 4, which enables kv cache quantization
58 | 
59 | Finally, run a test
60 | ```shell
61 | lmdeploy chat turbomind ./workspace
62 | ```
63 |
64 | [Click here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/kv_int8.md) for the kv cache int8 quantization formula, plus accuracy and GPU-memory test reports.
65 |
66 | ## 4. weight int4 quantization
67 | 
68 | lmdeploy implements weight int4 quantization based on the [AWQ algorithm](https://arxiv.org/abs/2306.00978); performance is more than 2.4x FP16, and GPU memory drops from 16G to 6.3G.
69 | 
70 | For your own model, the `auto_awq` tool can be used for this optimization
71 | ```shell
72 | # path to export the quantized model to
73 | WORK_DIR="./atom-7b-chat-w4"
74 |
75 | lmdeploy lite auto_awq \
76 | $HF_MODEL \ # huggingface model location
77 | --calib-dataset 'ptb' \ # calibration dataset; c4, ptb, wikitext2 and pileval are supported
78 | --calib-samples 128 \ # number of calibration samples; reduce if GPU memory is insufficient
79 | --calib-seqlen 2048 \ # length of a single text sample; reduce if GPU memory is insufficient
80 | --w-bits 4 \ # number of bits for weight quantization
81 | --w-group-size 128 \ # group size for weight-quantization statistics
82 | --work-dir $WORK_DIR
83 | ```
84 |
85 | Run the following to start the service:
86 | ```shell
87 | # this path is the workspace output by the model layout conversion in section 2 above
88 | FasterTransformer_PATH="/path/workspace"
89 |
90 | TP=1
91 | # GPUs to use
92 | DEVICES="0"
93 | for ((i = 1; i < ${TP}; ++i)); do
94 | DEVICES="${DEVICES},$i"
95 | done
96 | DEVICES="\"device=${DEVICES}\""
97 |
98 | # start the service inside a container
99 | docker run -idt \
100 | --gpus $DEVICES \
101 | -v $FasterTransformer_PATH:/workspace/models \
102 | --shm-size 16g \
103 | -p 33336:22 \
104 | -p 33337-33400:33337-33400 \
105 | --cap-add=SYS_PTRACE \
106 | --cap-add=SYS_ADMIN \
107 | --security-opt seccomp=unconfined \
108 | --name lmdeploy \
109 | --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \
110 | tritonserver \
111 | --model-repository=/workspace/models/model_repository \
112 | --allow-http=0 \
113 | --allow-grpc=1 \
114 | --grpc-port=33337 \
115 | --log-verbose=0 \
116 | --allow-metrics=1
117 | ```
118 |
119 | Client test:
120 | ```shell
121 | python test_api_server.py --tritonserver_addr 127.0.0.1:33337
122 | ```
123 |
124 | [Click here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md) for GPU-memory and speed test results of weight int4 quantization.
125 |
126 | Note that weight int4 and kv cache int8 do not conflict; both can be enabled at the same time to save even more GPU memory.
127 |
--------------------------------------------------------------------------------
/inference-speed/GPU/lmdeploy_example/test_api_server.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from lmdeploy.serve.turbomind.chatbot import Chatbot
4 |
5 | def input_prompt(chat_history, system_prompt: str):
6 |     """Input a prompt in the console interface."""
7 | prompt = ''
8 | for input_text_one in chat_history:
9 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n"
10 | if chat_history[-1]['role']=='Human':
11 | prompt += "Assistant: "
12 | else:
13 | prompt += "Human: "
14 | prompt = prompt[-2048:]
15 | if len(system_prompt)>0:
16 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt
17 |
18 | return prompt
19 |
20 | def main(tritonserver_addr: str,
21 | session_id: int = 1,
22 | cap: str = 'chat',
23 | stream_output: bool = True,
24 | **kwargs):
25 | """An example to communicate with inference server through the command line
26 | interface.
27 |
28 | Args:
29 | tritonserver_addr (str): the address in format "ip:port" of
30 | triton inference server
31 | session_id (int): the identical id of a session
32 | cap (str): the capability of a model. For example, codellama has
33 | the ability among ['completion', 'infill', 'instruct', 'python']
34 | stream_output (bool): indicator for streaming output or not
35 | **kwargs (dict): other arguments for initializing model's chat template
36 | """
37 | log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
38 | kwargs.update(capability=cap)
39 | chatbot = Chatbot(tritonserver_addr,
40 | log_level=log_level,
41 | display=stream_output,
42 | **kwargs)
43 | nth_round = 1
44 | prompt = input_prompt([{"role": "Human", "content" : "心情不好怎么办"}], "")
45 |
46 | request_id = f'{session_id}-{nth_round}'
47 | begin = time.time()
48 | if stream_output:
49 | for status, res, n_token in chatbot.stream_infer(
50 | session_id,
51 | prompt,
52 | request_id=request_id,
53 | request_output_len=512):
54 | # print("n_token:", n_token)
55 | continue
56 |
57 | else:
58 | status, res, n_token = chatbot.infer(session_id,
59 | prompt,
60 | request_id=request_id,
61 | request_output_len=512)
62 | print(res)
63 | # print("n_token:", n_token)
64 | nth_round += 1
65 | end = time.time()
66 | speed = n_token/(end-begin)
67 | print("speed {} tokens/s".format(speed))
68 |
69 |
70 | if __name__ == '__main__':
71 | import fire
72 |
73 | fire.Fire(main)
74 |
--------------------------------------------------------------------------------
/inference-speed/GPU/vllm_example/README.md:
--------------------------------------------------------------------------------
1 | # vllm inference deployment
2 | 
3 | [vllm](https://github.com/vllm-project/vllm) is another GPU inference option. Compared with FasterTransformer, vllm is simpler to use and requires no extra model conversion. fp16 inference is supported.
4 | 
5 | Features:
6 | 
7 | + fast inference speed
8 | + efficient kv cache
9 | + continuous batching of requests
10 | + optimized cuda kernels
11 | + distributed inference support
12 |
13 | ## Step 1: Install vllm
14 |
15 | ```bash
16 | pip install vllm
17 | ```
18 |
19 | ## Step 2: Start a test server
20 | 
21 | Download the Atom or Llama3 model from Huggingface:
22 | ```
23 | # download whichever model you want to deploy
24 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat
25 |
26 | # or download Meta's official Llama3 model:
27 | git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct Meta-Llama-3-8B-Instruct
28 | ```
29 |
30 | 1. Single-GPU inference
31 | 
32 | Edit single_gpu_api_server.sh and set model to the download path of the model above.
33 | 
34 | Start the test server
35 | ```bash
36 | # CUDA_VISIBLE_DEVICES in single_gpu_api_server.sh selects the GPU to use
37 | bash single_gpu_api_server.sh
38 | ```
39 |
40 | 2. Multi-GPU inference
41 | 
42 | Multi-GPU inference is recommended for the 13B and 70B models. Edit multi_gpus_api_server.sh and set model to the download path of the 13B model above.
43 | 
44 | Start the test server
45 | ```bash
46 | # CUDA_VISIBLE_DEVICES in multi_gpus_api_server.sh selects the GPUs to use
47 | # tensor-parallel-size sets the number of GPUs
48 | bash multi_gpus_api_server.sh
49 | ```
50 |
51 | ## Step 3: Run the client test
52 | 
53 | Note the model_source argument below: it can be llama_chinese, llama2_meta or llama3_meta depending on which model you downloaded; if the model came from [FlagAlpha](https://huggingface.co/FlagAlpha), use llama_chinese.
54 |
55 | ```
56 | python client_test.py --model_source llama_chinese
57 | ```
58 |
--------------------------------------------------------------------------------
/inference-speed/GPU/vllm_example/api_server.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | from typing import AsyncGenerator
4 |
5 | from fastapi import BackgroundTasks, FastAPI, Request
6 | from fastapi.responses import JSONResponse, Response, StreamingResponse
7 | import uvicorn
8 |
9 | from vllm.engine.arg_utils import AsyncEngineArgs
10 | from vllm.engine.async_llm_engine import AsyncLLMEngine
11 | from vllm.sampling_params import SamplingParams
12 | from vllm.utils import random_uuid
13 |
14 | TIMEOUT_KEEP_ALIVE = 5 # seconds.
15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds.
16 | app = FastAPI()
17 |
18 |
19 | @app.post("/generate")
20 | async def generate(request: Request) -> Response:
21 | """Generate completion for the request.
22 |
23 | The request should be a JSON object with the following fields:
24 | - prompt: the prompt to use for the generation.
25 | - stream: whether to stream the results or not.
26 | - other fields: the sampling parameters (See `SamplingParams` for details).
27 | """
28 | request_dict = await request.json()
29 | prompt = request_dict.pop("prompt")
30 | stream = request_dict.pop("stream", False)
31 | sampling_params = SamplingParams(**request_dict)
32 | request_id = random_uuid()
33 | results_generator = engine.generate(prompt, sampling_params, request_id)
34 |
35 | # Streaming case
36 | async def stream_results() -> AsyncGenerator[bytes, None]:
37 | async for request_output in results_generator:
38 | prompt = request_output.prompt
39 | text_outputs = [
40 | prompt + output.text for output in request_output.outputs
41 | ]
42 | ret = {"text": text_outputs}
43 | yield (json.dumps(ret) + "\0").encode("utf-8")
44 |
45 | async def abort_request() -> None:
46 | await engine.abort(request_id)
47 |
48 | if stream:
49 | background_tasks = BackgroundTasks()
50 | # Abort the request if the client disconnects.
51 | background_tasks.add_task(abort_request)
52 | return StreamingResponse(stream_results(), background=background_tasks)
53 |
54 | # Non-streaming case
55 | final_output = None
56 | async for request_output in results_generator:
57 | if await request.is_disconnected():
58 | # Abort the request if the client disconnects.
59 | await engine.abort(request_id)
60 | return Response(status_code=499)
61 | final_output = request_output
62 |
63 | assert final_output is not None
64 | prompt = final_output.prompt
65 | text_outputs = [prompt + output.text for output in final_output.outputs]
66 | ret = {"text": text_outputs}
67 | return JSONResponse(ret)
68 |
69 |
70 | if __name__ == "__main__":
71 | parser = argparse.ArgumentParser()
72 | parser.add_argument("--host", type=str, default="0.0.0.0")
73 | parser.add_argument("--port", type=int, default=8090)
74 | parser.add_argument("--trust_remote_code", type=bool, default=True)
75 | parser = AsyncEngineArgs.add_cli_args(parser)
76 | args = parser.parse_args()
77 |
78 | engine_args = AsyncEngineArgs.from_cli_args(args)
79 | engine = AsyncLLMEngine.from_engine_args(engine_args)
80 |
81 | uvicorn.run(app,
82 | host=args.host,
83 | port=args.port,
84 | log_level="debug",
85 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
86 |
--------------------------------------------------------------------------------
/inference-speed/GPU/vllm_example/client_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import json
3 | import time
4 | import argparse
5 |
6 | import urllib.request
7 |
8 | import sys
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--model_source', default="llama_chinese", choices =["llama_chinese", "llama2_meta", "llama3_meta"], required=False,type=str)
12 | args = parser.parse_args()
13 |
14 | def get_prompt_llama_chinese(
15 | chat_history, system_prompt=""
16 | ) -> str:
17 | prompt = ''
18 | for input_text_one in chat_history:
19 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n"
20 | if chat_history[-1]['role']=='Human':
21 | prompt += "Assistant: "
22 | else:
23 | prompt += "Human: "
24 | prompt = prompt[-2048:]
25 | if len(system_prompt)>0:
26 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt
27 |
28 | return prompt
29 |
30 | def get_prompt_llama2_meta(chat_history, system_prompt=""):
31 | B_INST, E_INST = "[INST]", "[/INST]"
32 | B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
33 |
34 | sep = " "
35 | sep2 =" "
36 | stop_token_ids = [2]
37 | system_template = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
38 | roles = ("[INST]", "[/INST]")
39 | seps = [sep, sep2]
40 | if system_prompt.strip() != "":
41 | ret = system_template
42 | else:
43 | ret = "[INST] "
44 | for i, chat in enumerate(chat_history):
45 | message = chat["content"]
46 | role = chat["role"]
47 | if message:
48 | if i == 0:
49 | ret += message + " "
50 | else:
51 | if role == "Human":
52 | ret += "[INST]" + " " + message + seps[i % 2]
53 | else:
54 | ret += "[/INST]" + " " + message + seps[i % 2]
55 | else:
56 | if role == "Human":
57 | ret += "[INST]"
58 | else:
59 | ret += "[/INST]"
60 | print("prompt:{}".format(ret))
61 | return ret
62 |
63 | def get_prompt_llama3_meta(chat_history, system_prompt=""):
64 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
65 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>'
66 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n'
67 | prompt_str = ''
68 | # Concatenate the dialogue history
69 | for item in chat_history:
70 | if item['role']=='Human':
71 | prompt_str+=user_format.format(content=item['content'])
72 | else:
73 | prompt_str+=assistant_format.format(content=item['content'])
74 | if len(system_prompt)>0:
75 | prompt_str = system_format.format(content=system_prompt) + prompt_str
76 | prompt_str = "<|begin_of_text|>" + prompt_str
77 | return prompt_str
78 |
79 |
80 | def test_api_server(chat_history=[], system_prompt=""):
81 | header = {'Content-Type': 'application/json'}
82 |
83 | if args.model_source == "llama2_meta":
84 | prompt = get_prompt_llama2_meta(chat_history, system_prompt)
85 | elif args.model_source == "llama3_meta":
86 | prompt = get_prompt_llama3_meta(chat_history, system_prompt)
87 | else:
88 | prompt = get_prompt_llama_chinese(chat_history, system_prompt)
89 |
90 | data = {
91 | "prompt": prompt,
92 | "stream" : False,
93 | "n" : 1,
94 | "best_of": 1,
95 | "presence_penalty": 0.0,
96 | "frequency_penalty": 0.2,
97 | "temperature": 0.3,
98 | "top_p" : 0.95,
99 | "top_k": 50,
100 | "use_beam_search": False,
101 | "stop": [],
102 | "ignore_eos" :False,
103 | "max_tokens": 2048,
104 | "logprobs": None
105 | }
106 | request = urllib.request.Request(
107 | url='http://127.0.0.1:8090/generate',
108 | headers=header,
109 | data=json.dumps(data).encode('utf-8')
110 | )
111 |
112 | result = None
113 | try:
114 | response = urllib.request.urlopen(request, timeout=300)
115 | res = response.read().decode('utf-8')
116 | result = json.loads(res)
117 | print(json.dumps(data, ensure_ascii=False, indent=2))
118 | print(json.dumps(result, ensure_ascii=False, indent=2))
119 |
120 | except Exception as e:
121 | print(e)
122 |
123 | return result
124 |
125 | if __name__ == "__main__":
126 | # Multi-turn dialogue test
127 | """ Multi-turn dialogue test
128 | last_question = "怎么回来呢"
129 | inputs = [{"role": "Human", "content": "如何去北京"},
130 | {"role": "Assistant", "content": "乘坐飞机或者轮船"},
131 | {"role" : "Human", "content": last_question}]
132 | """
133 | # Single-turn dialogue
134 | last_question = "怎么去北京"
135 | chat_history = [ {"role" : "Human", "content": last_question}]
136 | test_api_server(chat_history)
137 |
138 |
--------------------------------------------------------------------------------
/inference-speed/GPU/vllm_example/multi_gpus_api_server.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0,1 python api_server.py \
2 | --model "./Atom-7B-Chat" \
3 | --port 8090 \
4 | --tensor-parallel-size 2
5 |
--------------------------------------------------------------------------------
/inference-speed/GPU/vllm_example/single_gpu_api_server.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python api_server.py \
3 | --model "./Atom-7B-Chat" \
4 | --port 8090
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==2.1.2
2 | bitsandbytes==0.42.0
3 | accelerate==0.27.2
4 | numpy==1.26.4
5 | gekko==1.0.6
6 | pandas
7 | scipy
8 | sentencepiece==0.2.0
9 | datasets
10 | evaluate
11 | pytest
12 | peft==0.8.2
13 | transformers==4.39.0
14 | deepspeed==0.14.0
15 | scikit-learn
16 | torchvision
17 | torchdata
18 | torchaudio
19 | tensorboard
20 | gradio
21 | packaging
--------------------------------------------------------------------------------
/scripts/api/README.md:
--------------------------------------------------------------------------------
1 | # API Usage
2 |
3 | Download the specific model you want to deploy, for example:
4 | ```bash
5 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat
6 | mv Atom-7B-Chat /path/origin_model
7 | ```
8 |
9 | First install the extra dependencies with `pip install fastapi uvicorn`, then run [accelerate_server.py](accelerate_server.py) from this repository:
10 |
11 | ```bash
12 | python accelerate_server.py \
13 | --model_path /path/origin_model \
14 | --gpus "0" \
15 | --infer_dtype "int8" \
16 | --model_source "llama2_chinese"
17 | ```
18 | Argument description:
19 | - model_path: local path to the model
20 | - gpus: GPU indices to use, e.g. "0" or "0,1"
21 | - infer_dtype: data type of the loaded model parameters; int8 or float16 (int4 is also accepted and loads a GPTQ-quantized model via auto_gptq)
22 | - model_source: source of the model, one of llama2_chinese, llama2_meta, or llama3_meta, depending on which model was downloaded; if it was downloaded from [FlagAlpha](https://huggingface.co/FlagAlpha), use llama2_chinese.
23 |
24 |
25 | The service is deployed on local port 8001 by default and is invoked via POST:
26 |
27 | ```bash
28 | python accelerate_client.py
29 | ```
30 |
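31 | For reference, a direct call can also be made without accelerate_client.py. The sketch below is a minimal, illustrative example that assumes the default 127.0.0.1:8001 address and uses only the request fields read by accelerate_server.py (history, system_prompt, max_new_tokens, top_p, temperature).
32 | 
33 | ```python
34 | # Minimal sketch: POST to the /generate endpoint of accelerate_server.py (assumed at 127.0.0.1:8001).
35 | import json
36 | import urllib.request
37 | 
38 | payload = {
39 |     "system_prompt": "",
40 |     "history": [{"role": "Human", "content": "怎么去北京"}],
41 |     "temperature": 0.3,
42 |     "top_p": 0.95,
43 |     "max_new_tokens": 512,
44 | }
45 | request = urllib.request.Request(
46 |     url="http://127.0.0.1:8001/generate",
47 |     headers={"Content-Type": "application/json"},
48 |     data=json.dumps(payload).encode("utf-8"),
49 | )
50 | with urllib.request.urlopen(request, timeout=300) as response:
51 |     result = json.loads(response.read().decode("utf-8"))
52 |     print(result["response"])  # accelerate_server.py returns {"response", "status", "time"}
53 | ```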
--------------------------------------------------------------------------------
/scripts/api/accelerate_client.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import json
3 | import time
4 | import urllib.request
5 | import sys
6 |
7 | def test_api_server(input_text):
8 | header = {'Content-Type': 'application/json'}
9 |
10 | data = {
11 | "system_prompt": "",
12 | "history": inputs,
13 | "n" : 1,
14 | "best_of": 1,
15 | "presence_penalty": 1.2,
16 | "frequency_penalty": 0.2,
17 | "temperature": 0.3,
18 | "top_p" : 0.95,
19 | "top_k": 50,
20 | "use_beam_search": False,
21 | "stop": [],
22 | "ignore_eos" :False,
23 | "logprobs": None,
24 | "max_new_tokens": 2048,
25 | }
26 | request = urllib.request.Request(
27 | url='http://127.0.0.1:8001/generate',
28 | headers=header,
29 | data=json.dumps(data).encode('utf-8')
30 | )
31 |
32 | result = None
33 | try:
34 | response = urllib.request.urlopen(request, timeout=300)
35 | res = response.read().decode('utf-8')
36 | result = json.loads(res)
37 | print(json.dumps(data, ensure_ascii=False, indent=2))
38 | print(json.dumps(result, ensure_ascii=False, indent=2))
39 |
40 | except Exception as e:
41 | print(e)
42 |
43 | return result
44 |
45 | if __name__ == "__main__":
46 |
47 | # Multi-turn dialogue test
48 | """ Multi-turn dialogue test
49 | last_question = "怎么回来呢"
50 | inputs = [{"role": "Human", "content": "如何去北京"},
51 | {"role": "Assistant", "content": "乘坐飞机或者轮船"},
52 | {"role" : "Human", "content": last_question}]
53 | """
54 | # Single-turn dialogue
55 | last_question = "怎么去北京"
56 | inputs = [ {"role" : "Human", "content": last_question}]
57 |
58 | test_api_server(inputs)
59 |
60 |
--------------------------------------------------------------------------------
/scripts/api/accelerate_server.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import argparse
3 | import gc
4 | import math
5 | import os
6 | import time
7 |
8 | from fastapi import FastAPI, Request
9 | from transformers import AutoTokenizer, AutoModel
10 | import uvicorn, json, datetime
11 | import torch
12 | import torch.distributed as dist
13 |
14 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
15 |
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument('--model_path',required=True,type=str)
18 | parser.add_argument('--gpus', default="0", type=str)
19 | parser.add_argument('--infer_dtype', default="int8", choices=["int4", "int8", "float16"], required=False,type=str)
20 | parser.add_argument('--model_source', default="llama2_chinese", choices =["llama2_chinese", "llama2_meta", "llama3_meta"], required=False,type=str)
21 |
22 | args = parser.parse_args()
23 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
24 |
25 | local_rank = int(os.getenv("LOCAL_RANK", "0"))
26 | world_size = torch.cuda.device_count()
27 |
28 | rank = local_rank
29 |
30 | app = FastAPI()
31 |
32 | def get_prompt_llama2chinese(
33 | chat_history, system_prompt=""
34 | ) -> str:
35 | prompt = ''
36 | for input_text_one in chat_history:
37 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n"
38 | if chat_history[-1]['role']=='Human':
39 | prompt += "Assistant: "
40 | else:
41 | prompt += "Human: "
42 | prompt = prompt[-2048:]
43 | if len(system_prompt)>0:
44 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt
45 |
46 | return prompt
47 |
48 | def get_prompt(chat_history, system_prompt=""):
49 | B_INST, E_INST = "[INST]", "[/INST]"
50 | B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
51 |
52 | sep = " "
53 | sep2 =" "
54 | stop_token_ids = [2]
55 | system_template = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
56 | roles = ("[INST]", "[/INST]")
57 | seps = [sep, sep2]
58 | if system_prompt.strip() != "":
59 | ret = system_template
60 | else:
61 | ret = "[INST] "
62 | for i, chat in enumerate(chat_history):
63 | message = chat["content"]
64 | role = chat["role"]
65 | if message:
66 | if i == 0:
67 | ret += message + " "
68 | else:
69 | if role == "Human":
70 | ret += "[INST]" + " " + message + seps[i % 2]
71 | else:
72 | ret += "[/INST]" + " " + message + seps[i % 2]
73 | else:
74 | if role == "Human":
75 | ret += "[INST]"
76 | else:
77 | ret += "[/INST]"
78 | print("prompt:{}".format(ret))
79 | return ret
80 |
81 | def get_prompt_llama3(chat_history, system_prompt=""):
82 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
83 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>'
84 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n'
85 | prompt_str = ''
86 | # Concatenate the dialogue history
87 | for item in chat_history:
88 | if item['role']=='Human':
89 | prompt_str+=user_format.format(content=item['content'])
90 | else:
91 | prompt_str+=assistant_format.format(content=item['content'])
92 | if len(system_prompt)>0:
93 | prompt_str = system_format.format(content=system_prompt) + prompt_str
94 | prompt_str = "<|begin_of_text|>" + prompt_str
95 | return prompt_str
96 |
97 |
98 | @app.post("/generate")
99 | async def create_item(request: Request):
100 | global model, tokenizer
101 | json_post_raw = await request.json()
102 | json_post = json.dumps(json_post_raw)
103 | json_post_list = json.loads(json_post)
104 | history = json_post_list.get('history')
105 | system_prompt = json_post_list.get('system_prompt')
106 | max_new_tokens = json_post_list.get('max_new_tokens')
107 | top_p = json_post_list.get('top_p')
108 | temperature = json_post_list.get('temperature')
109 |
110 | if args.model_source == "llama2_meta":
111 | prompt = get_prompt(history, system_prompt)
112 | elif args.model_source == "llama3_meta":
113 | prompt = get_prompt_llama3(history, system_prompt)
114 | else:
115 | prompt = get_prompt_llama2chinese(history, system_prompt)
116 |
117 | inputs = tokenizer([prompt], return_tensors='pt').to("cuda")
118 | generate_kwargs = dict(
119 | inputs,
120 | # streamer=streamer,
121 | max_new_tokens=max_new_tokens,
122 | do_sample=True,
123 | top_p=top_p,
124 | top_k=50,
125 | temperature=temperature,
126 | num_beams=1,
127 | repetition_penalty=1.2,
128 | max_length=2048,
129 | )
130 | generate_ids = model.generate(**generate_kwargs)
131 |
132 | generate_ids = [item[len(inputs[0]):-1] for item in generate_ids]
133 |
134 | bot_message = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
135 | if 'Human:' in bot_message:
136 | bot_message = bot_message.split('Human:')[0]
137 |
138 | now = datetime.datetime.now()
139 | time = now.strftime("%Y-%m-%d %H:%M:%S")
140 | answer = {
141 | "response": bot_message,
142 | "status": 200,
143 | "time": time
144 | }
145 | return answer
146 |
147 | def get_world_size() -> int:
148 | if dist.is_initialized():
149 | return dist.get_world_size()
150 | else:
151 | return 1
152 |
153 | def print_rank0(*msg):
154 | if rank != 0:
155 | return
156 | print(*msg)
157 |
158 |
159 | if __name__ == '__main__':
160 | dtype = torch.float16
161 | kwargs = dict(
162 | device_map="auto",
163 | )
164 | print("get_world_size:{}".format(get_world_size()))
165 |
166 | infer_dtype = args.infer_dtype
167 | if infer_dtype not in ["int4", "int8", "float16"]:
168 | raise ValueError("infer_dtype must be one of int4, int8 or float16")
169 |
170 | if get_world_size() > 1:
171 | kwargs["device_map"] = "balanced_low_0"
172 |
173 | if infer_dtype == "int8":
174 | print_rank0("Using `load_in_8bit=True` to use the quantized model")
175 | kwargs["load_in_8bit"] = True
176 | else:
177 | kwargs["torch_dtype"] = dtype
178 |
179 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
180 | if infer_dtype in ["int8", "float16"]:
181 | model = AutoModelForCausalLM.from_pretrained(args.model_path, **kwargs,trust_remote_code=True,use_flash_attention_2=True)
182 | elif infer_dtype == "int4":
183 | from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
184 | model = AutoGPTQForCausalLM.from_quantized(
185 | args.model_path, device="cuda:0",
186 | use_triton=False,
187 | low_cpu_mem_usage=True,
188 | # inject_fused_attention=False,
189 | # inject_fused_mlp=False
190 | )
191 |
192 | model.eval()
193 | uvicorn.run(app, host='0.0.0.0', port=8001, workers=1)
194 |
--------------------------------------------------------------------------------
/scripts/convert2hf/README.md:
--------------------------------------------------------------------------------
1 | ## Convert Meta's official model weights to the Hugging Face format
2 |
3 | Run the script:
4 | ```bash
5 | python convert_llama_weights_to_hf.py \
6 | --input_dir /path/to/downloaded/llama/weights \
7 | --model_size 7B \
8 | --output_dir /output/path
9 | ```
10 |
11 | The weights converted by the script can be loaded with transformers, for example:
12 |
13 | ```py
14 | from transformers import LlamaForCausalLM, LlamaTokenizer
15 |
16 | model = LlamaForCausalLM.from_pretrained("/output/path")
17 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
18 | ```
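19 | 
20 | As a quick sanity check, the sketch below (a minimal, illustrative example that assumes the converted weights are in /output/path and that a GPU plus the accelerate package are available) generates a few tokens with the converted model:
21 | 
22 | ```python
23 | import torch
24 | from transformers import LlamaForCausalLM, LlamaTokenizer
25 | 
26 | # Load the converted checkpoint in half precision and place it automatically.
27 | model = LlamaForCausalLM.from_pretrained("/output/path", torch_dtype=torch.float16, device_map="auto")
28 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
29 | 
30 | input_ids = tokenizer("Human: 介绍一下北京\nAssistant: ", return_tensors="pt").input_ids.to(model.device)
31 | output_ids = model.generate(input_ids, max_new_tokens=32, do_sample=True, top_p=0.95, temperature=0.3)
32 | print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
33 | ```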
--------------------------------------------------------------------------------
/scripts/convert2hf/convert_llama_weights_to_hf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import argparse
15 | import gc
16 | import json
17 | import os
18 | import shutil
19 | import warnings
20 |
21 | import torch
22 |
23 | from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
24 |
25 |
26 | try:
27 | from transformers import LlamaTokenizerFast
28 | except ImportError as e:
29 | warnings.warn(e)
30 | warnings.warn(
31 | "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
32 | )
33 | LlamaTokenizerFast = None
34 |
35 | """
36 | Sample usage:
37 |
38 | ```
39 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \
40 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
41 | ```
42 |
43 | Thereafter, models can be loaded via:
44 |
45 | ```py
46 | from transformers import LlamaForCausalLM, LlamaTokenizer
47 |
48 | model = LlamaForCausalLM.from_pretrained("/output/path")
49 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
50 | ```
51 |
52 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
53 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
54 | """
55 |
56 | INTERMEDIATE_SIZE_MAP = {
57 | "7B": 11008,
58 | "13B": 13824,
59 | "30B": 17920,
60 | "65B": 22016,
61 | "70B": 28672,
62 | }
63 | NUM_SHARDS = {
64 | "7B": 1,
65 | "7Bf": 1,
66 | "13B": 2,
67 | "13Bf": 2,
68 | "30B": 4,
69 | "65B": 8,
70 | "70B": 8,
71 | "70Bf": 8,
72 | }
73 |
74 |
75 | def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
76 | return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
77 |
78 |
79 | def read_json(path):
80 | with open(path, "r") as f:
81 | return json.load(f)
82 |
83 |
84 | def write_json(text, path):
85 | with open(path, "w") as f:
86 | json.dump(text, f)
87 |
88 |
89 | def write_model(model_path, input_base_path, model_size, safe_serialization=True):
90 | os.makedirs(model_path, exist_ok=True)
91 | tmp_model_path = os.path.join(model_path, "tmp")
92 | os.makedirs(tmp_model_path, exist_ok=True)
93 |
94 | params = read_json(os.path.join(input_base_path, "params.json"))
95 | num_shards = NUM_SHARDS[model_size]
96 | n_layers = params["n_layers"]
97 | n_heads = params["n_heads"]
98 | n_heads_per_shard = n_heads // num_shards
99 | dim = params["dim"]
100 | dims_per_head = dim // n_heads
101 | base = 10000.0
102 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
103 |
104 | if "n_kv_heads" in params:
105 | num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
106 | num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
107 | key_value_dim = dim // num_key_value_heads
108 | else: # compatibility with other checkpoints
109 | num_key_value_heads = n_heads
110 | num_local_key_value_heads = n_heads_per_shard
111 | key_value_dim = dim
112 |
113 | # permute for sliced rotary
114 | def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
115 | return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
116 |
117 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
118 | # Load weights
119 | if model_size == "7B":
120 | # Not sharded
121 | # (The sharded implementation would also work, but this is simpler.)
122 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
123 | else:
124 | # Sharded
125 | loaded = [
126 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
127 | for i in range(num_shards)
128 | ]
129 | param_count = 0
130 | index_dict = {"weight_map": {}}
131 | for layer_i in range(n_layers):
132 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
133 | if model_size == "7B":
134 | # Unsharded
135 | state_dict = {
136 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
137 | loaded[f"layers.{layer_i}.attention.wq.weight"]
138 | ),
139 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
140 | loaded[f"layers.{layer_i}.attention.wk.weight"]
141 | ),
142 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
143 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
144 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
145 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
146 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
147 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
148 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
149 | }
150 | else:
151 | # Sharded
152 | # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share
153 | # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is
154 | # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned.
155 |
156 | state_dict = {
157 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
158 | f"layers.{layer_i}.attention_norm.weight"
159 | ].clone(),
160 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
161 | f"layers.{layer_i}.ffn_norm.weight"
162 | ].clone(),
163 | }
164 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
165 | torch.cat(
166 | [
167 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
168 | for i in range(num_shards)
169 | ],
170 | dim=0,
171 | ).reshape(dim, dim)
172 | )
173 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
174 | torch.cat(
175 | [
176 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(
177 | num_local_key_value_heads, dims_per_head, dim
178 | )
179 | for i in range(num_shards)
180 | ],
181 | dim=0,
182 | ).reshape(key_value_dim, dim),
183 | num_key_value_heads,
184 | key_value_dim,
185 | dim,
186 | )
187 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
188 | [
189 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(
190 | num_local_key_value_heads, dims_per_head, dim
191 | )
192 | for i in range(num_shards)
193 | ],
194 | dim=0,
195 | ).reshape(key_value_dim, dim)
196 |
197 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
198 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
199 | )
200 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
201 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
202 | )
203 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
204 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
205 | )
206 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
207 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
208 | )
209 |
210 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
211 | for k, v in state_dict.items():
212 | index_dict["weight_map"][k] = filename
213 | param_count += v.numel()
214 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
215 |
216 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
217 | if model_size == "7B":
218 | # Unsharded
219 | state_dict = {
220 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"],
221 | "model.norm.weight": loaded["norm.weight"],
222 | "lm_head.weight": loaded["output.weight"],
223 | }
224 | else:
225 | state_dict = {
226 | "model.norm.weight": loaded[0]["norm.weight"],
227 | "model.embed_tokens.weight": torch.cat(
228 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
229 | ),
230 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
231 | }
232 |
233 | for k, v in state_dict.items():
234 | index_dict["weight_map"][k] = filename
235 | param_count += v.numel()
236 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
237 |
238 | # Write configs
239 | index_dict["metadata"] = {"total_size": param_count * 2}
240 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
241 | ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
242 | multiple_of = params["multiple_of"] if "multiple_of" in params else 256
243 | config = LlamaConfig(
244 | hidden_size=dim,
245 | intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
246 | num_attention_heads=params["n_heads"],
247 | num_hidden_layers=params["n_layers"],
248 | rms_norm_eps=params["norm_eps"],
249 | num_key_value_heads=num_key_value_heads,
250 | )
251 | config.save_pretrained(tmp_model_path)
252 |
253 | # Make space so we can load the model properly now.
254 | del state_dict
255 | del loaded
256 | gc.collect()
257 |
258 | print("Loading the checkpoint in a Llama model.")
259 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
260 | # Avoid saving this as part of the config.
261 | del model.config._name_or_path
262 |
263 | print("Saving in the Transformers format.")
264 | model.save_pretrained(model_path, safe_serialization=safe_serialization)
265 | shutil.rmtree(tmp_model_path)
266 |
267 |
268 | def write_tokenizer(tokenizer_path, input_tokenizer_path):
269 | # Initialize the tokenizer based on the `spm` model
270 | tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
271 | print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
272 | tokenizer = tokenizer_class(input_tokenizer_path)
273 | tokenizer.save_pretrained(tokenizer_path)
274 |
275 |
276 | def main():
277 | parser = argparse.ArgumentParser()
278 | parser.add_argument(
279 | "--input_dir",
280 | help="Location of LLaMA weights, which contains tokenizer.model and model folders",
281 | )
282 | parser.add_argument(
283 | "--model_size",
284 | choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"],
285 | help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
286 | )
287 | parser.add_argument(
288 | "--output_dir",
289 | help="Location to write HF model and tokenizer",
290 | )
291 | parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
292 | args = parser.parse_args()
293 | if args.model_size != "tokenizer_only":
294 | write_model(
295 | model_path=args.output_dir,
296 | # input_base_path=os.path.join(args.input_dir, args.model_size),
297 | input_base_path=args.input_dir,
298 | model_size=args.model_size,
299 | safe_serialization=args.safe_serialization,
300 | )
301 | spm_path = os.path.join(args.input_dir, "tokenizer.model")
302 | write_tokenizer(args.output_dir, spm_path)
303 |
304 |
305 | if __name__ == "__main__":
306 | main()
307 |
308 |
--------------------------------------------------------------------------------
/scripts/test_model/test_pretrain_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
11 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
12 | "from transformers import AutoTokenizer,AutoModelForCausalLM\n",
13 | "import torch\n",
14 | "model = AutoModelForCausalLM.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',torch_dtype=torch.float16,device_map='auto',trust_remote_code=True)\n",
15 | "tokenizer = AutoTokenizer.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',use_fast=False)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "input_ids = tokenizer(['''Human: 介绍一下北京\\nAssistant: '''], return_tensors=\"pt\",add_special_tokens=False).input_ids.to('cuda') \n",
25 | "print(input_ids) "
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "generate_input = {\n",
35 | " \"input_ids\":input_ids,\n",
36 | " \"max_new_tokens\":10,\n",
37 | " \"do_sample\":True,\n",
38 | " \"top_k\":50,\n",
39 | " \"top_p\":0.95,\n",
40 | " \"temperature\":1,\n",
41 | " \"repetition_penalty\":1.0,\n",
42 | " \"eos_token_id\":tokenizer.eos_token_id,\n",
43 | " \"bos_token_id\":tokenizer.bos_token_id,\n",
44 | " \"pad_token_id\":tokenizer.pad_token_id\n",
45 | "}\n",
46 | "generate_ids = model.generate(**generate_input)\n",
47 | "text = tokenizer.decode(generate_ids[0])\n",
48 | "print(text)"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "# checkpoint-100 的模型输出\n",
56 | "\n",
57 | "# checkpoint-5000 的模型输出\n"
58 | ]
59 | }
60 | ],
61 | "metadata": {
62 | "language_info": {
63 | "name": "python"
64 | }
65 | },
66 | "nbformat": 4,
67 | "nbformat_minor": 2
68 | }
69 |
--------------------------------------------------------------------------------
/train/merge_peft_model/merge.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python merge_peft_adapter.py \
2 | --adapter_model_name /checkpoint-2200 \
3 | --output_name checkpoint-2200_merge \
4 | --load8bit false \
5 | --tokenizer_fast false
--------------------------------------------------------------------------------
/train/merge_peft_model/merge_muilt.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=3 python merge_muilt_peft_adapter.py \
2 | --adapter_model_name checkpoint-8000 \
3 | checkpoint-140 \
4 | --output_name checkpoint-140-8000_merge
--------------------------------------------------------------------------------
/train/merge_peft_model/merge_muilt_peft_adapter.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional,List
3 |
4 | import peft
5 | import torch
6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification
7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification
8 | from peft.utils import _get_submodules
9 |
10 | @dataclass
11 | class ScriptArguments:
12 | """
13 | The name of the causal LM model we wish to fine-tune with PPO
14 | """
15 |
16 | adapter_model_name: Optional[List[str]] = field(default=None, metadata={"help": "the model name"})
17 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
18 |
19 |
20 | parser = HfArgumentParser(ScriptArguments)
21 | script_args = parser.parse_args_into_dataclasses()[0]
22 |
23 | base_model = None
24 | for one_lora_path in script_args.adapter_model_name:
25 | if base_model==None:
26 | peft_config = PeftConfig.from_pretrained(one_lora_path)
27 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
28 | tokenizer.save_pretrained(f"{script_args.output_name}")
29 | base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.bfloat16)
30 | peft_config = PeftConfig.from_pretrained(one_lora_path)
31 | base_model = PeftModel.from_pretrained(base_model, one_lora_path,device_map={"": 0})
32 | # model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, device_map='auto',load_in_8bit=True)
33 | # Load the Lora model
34 | base_model = base_model.merge_and_unload()
35 | base_model.eval()
36 |
37 |
38 |
39 |
40 | # key_list = [key for key, _ in model.base_model.model.named_modules() if "lora" not in key]
41 | # for key in key_list:
42 | # print(key)
43 | # parent, target, target_name = _get_submodules(model.base_model,key)
44 | # if isinstance(target, peft.tuners.lora.Linear):
45 | # print('peft.tuners.lora.Linear')
46 | # bias = target.bias is not None
47 | # new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias)
48 | # model.base_model._replace_module(parent, target_name, new_module, target)
49 |
50 | # model = model.base_model.model
51 |
52 |
53 | base_model.save_pretrained(f"{script_args.output_name}")
54 | # model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False)
--------------------------------------------------------------------------------
/train/merge_peft_model/merge_peft_adapter.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 | import peft
5 | import torch
6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification
7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification
8 | from peft.utils import _get_submodules
9 |
10 | @dataclass
11 | class ScriptArguments:
12 | """
13 | The name of the causal LM model we wish to fine-tune with PPO
14 | """
15 |
16 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
17 | load8bit : Optional[bool] = field(default=None, metadata={"help": "the model type"})
18 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
19 | tokenizer_fast:Optional[bool] = field(default=None, metadata={"help": "the model type"})
20 |
21 |
22 | parser = HfArgumentParser(ScriptArguments)
23 | script_args = parser.parse_args_into_dataclasses()[0]
24 |
25 |
26 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name)
27 | model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.float16,device_map='auto',trust_remote_code=True)
28 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name,device_map='auto')
29 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path,use_fast=script_args.tokenizer_fast)
30 | config = AutoConfig.from_pretrained(peft_config.base_model_name_or_path)
31 | architecture = config.architectures[0]
32 | print(architecture)
33 | # Load the Lora model
34 | model = model.merge_and_unload()
35 | model.eval()
36 |
37 |
38 | model.save_pretrained(f"{script_args.output_name}")
39 | tokenizer.save_pretrained(f"{script_args.output_name}")
40 | if script_args.load8bit:
41 | model = AutoModelForCausalLM.from_pretrained(script_args.output_name, torch_dtype=torch.float16,load_in_8bit=script_args.load8bit,device_map='auto',trust_remote_code=True)
42 | model.save_pretrained(f"{script_args.output_name}",max_shard_size='5GB')
--------------------------------------------------------------------------------
/train/pretrain/accuracy.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Accuracy metric."""
15 |
16 | import datasets
17 | from sklearn.metrics import accuracy_score
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
24 | Accuracy = (TP + TN) / (TP + TN + FP + FN)
25 | Where:
26 | TP: True positive
27 | TN: True negative
28 | FP: False positive
29 | FN: False negative
30 | """
31 |
32 |
33 | _KWARGS_DESCRIPTION = """
34 | Args:
35 | predictions (`list` of `int`): Predicted labels.
36 | references (`list` of `int`): Ground truth labels.
37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
38 | sample_weight (`list` of `float`): Sample weights. Defaults to None.
39 |
40 | Returns:
41 | accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0 if `normalize` is set to `True`, or the number of examples input otherwise. A higher score means higher accuracy.
42 |
43 | Examples:
44 |
45 | Example 1-A simple example
46 | >>> accuracy_metric = evaluate.load("accuracy")
47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
48 | >>> print(results)
49 | {'accuracy': 0.5}
50 |
51 | Example 2-The same as Example 1, except with `normalize` set to `False`.
52 | >>> accuracy_metric = evaluate.load("accuracy")
53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
54 | >>> print(results)
55 | {'accuracy': 3.0}
56 |
57 | Example 3-The same as Example 1, except with `sample_weight` set.
58 | >>> accuracy_metric = evaluate.load("accuracy")
59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
60 | >>> print(results)
61 | {'accuracy': 0.8778625954198473}
62 | """
63 |
64 |
65 | _CITATION = """
66 | @article{scikit-learn,
67 | title={Scikit-learn: Machine Learning in {P}ython},
68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
71 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
72 | journal={Journal of Machine Learning Research},
73 | volume={12},
74 | pages={2825--2830},
75 | year={2011}
76 | }
77 | """
78 |
79 |
80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
81 | class Accuracy(evaluate.Metric):
82 | def _info(self):
83 | return evaluate.MetricInfo(
84 | description=_DESCRIPTION,
85 | citation=_CITATION,
86 | inputs_description=_KWARGS_DESCRIPTION,
87 | features=datasets.Features(
88 | {
89 | "predictions": datasets.Sequence(datasets.Value("int32")),
90 | "references": datasets.Sequence(datasets.Value("int32")),
91 | }
92 | if self.config_name == "multilabel"
93 | else {
94 | "predictions": datasets.Value("int32"),
95 | "references": datasets.Value("int32"),
96 | }
97 | ),
98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
99 | )
100 |
101 | def _compute(self, predictions, references, normalize=True, sample_weight=None):
102 | return {
103 | "accuracy": float(
104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
105 | )
106 | }
107 |
--------------------------------------------------------------------------------
/train/pretrain/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "betas": "auto",
15 | "eps": "auto",
16 | "weight_decay": "auto"
17 | }
18 | },
19 |
20 | "scheduler": {
21 | "type": "WarmupDecayLR",
22 | "params": {
23 | "last_batch_iteration": -1,
24 | "total_num_steps": "auto",
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 |
31 | "zero_optimization": {
32 | "stage": 2,
33 | "offload_optimizer": {
34 | "device": "cpu",
35 | "pin_memory": true
36 | },
37 | "offload_param": {
38 | "device": "cpu",
39 | "pin_memory": true
40 | },
41 | "allgather_partitions": true,
42 | "allgather_bucket_size": 5e8,
43 | "overlap_comm": true,
44 | "reduce_scatter": true,
45 | "reduce_bucket_size": 5e8,
46 | "contiguous_gradients": true
47 | },
48 | "activation_checkpointing": {
49 | "partition_activations": false,
50 | "cpu_checkpointing": false,
51 | "contiguous_memory_optimization": false,
52 | "number_checkpoints": null,
53 | "synchronize_checkpoint_boundary": false,
54 | "profile": false
55 | },
56 | "gradient_accumulation_steps": "auto",
57 | "gradient_clipping": "auto",
58 | "steps_per_print": 2000,
59 | "train_batch_size": "auto",
60 | "min_lr": 5e-7,
61 | "train_micro_batch_size_per_gpu": "auto",
62 | "wall_clock_breakdown": false
63 | }
--------------------------------------------------------------------------------
/train/pretrain/ds_config_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1,
9 | "fp16_opt_level": "O2"
10 | },
11 |
12 | "bf16": {
13 | "enabled": "auto"
14 | },
15 |
16 | "optimizer": {
17 | "type": "AdamW",
18 | "params": {
19 | "lr": "auto",
20 | "betas": "auto",
21 | "eps": "auto",
22 | "weight_decay": "auto"
23 | }
24 | },
25 |
26 | "scheduler": {
27 | "type": "WarmupDecayLR",
28 | "params": {
29 | "last_batch_iteration": -1,
30 | "total_num_steps": "auto",
31 | "warmup_min_lr": "auto",
32 | "warmup_max_lr": "auto",
33 | "warmup_num_steps": "auto"
34 | }
35 | },
36 |
37 | "zero_optimization": {
38 | "stage": 3,
39 | "overlap_comm": true,
40 | "contiguous_gradients": true,
41 | "sub_group_size": 1e9,
42 | "reduce_bucket_size": "auto",
43 | "stage3_prefetch_bucket_size": "auto",
44 | "stage3_param_persistence_threshold": "auto",
45 | "stage3_max_live_parameters": 1e9,
46 | "stage3_max_reuse_distance": 1e9,
47 | "gather_16bit_weights_on_model_save": true
48 | },
49 | "gradient_accumulation_steps": "auto",
50 | "gradient_clipping": "auto",
51 | "steps_per_print": 2000,
52 | "train_batch_size": "auto",
53 | "train_micro_batch_size_per_gpu": "auto",
54 | "wall_clock_breakdown": false
55 | }
--------------------------------------------------------------------------------
/train/pretrain/pretrain.sh:
--------------------------------------------------------------------------------
1 | output_model=output_model
2 | if [ ! -d ${output_model} ];then
3 | mkdir ${output_model}
4 | fi
5 | cp ./pretrain.sh ${output_model}
6 | cp ./ds_config_zero*.json ${output_model}
7 | export CUDA_HOME=/usr/local/cuda/
8 | export NCCL_P2P_DISABLE=1
9 |
10 | deepspeed --include localhost:0,2 pretrain_clm.py \
11 | --config_name ../../model_config/Atom-100M/config.json \
12 | --tokenizer_name ../../model_config/Atom-100M \
13 | --train_files ../../data/wiki_zh/train_lm_task_0.csv \
14 | ../../data/wiki_zh/train_lm_task_1.csv \
15 | --validation_files ../../data/wiki_zh/dev_lm_task.csv \
16 | --per_device_train_batch_size 32 \
17 | --per_device_eval_batch_size 32 \
18 | --do_train \
19 | --output_dir ${output_model} \
20 | --evaluation_strategy steps \
21 | --use_fast_tokenizer false \
22 | --max_eval_samples 500 \
23 | --learning_rate 1e-4 \
24 | --gradient_accumulation_steps 2 \
25 | --num_train_epochs 3 \
26 | --warmup_steps 5000 \
27 | --logging_dir ${output_model}/logs \
28 | --logging_strategy steps \
29 | --logging_steps 5 \
30 | --save_strategy steps \
31 | --preprocessing_num_workers 10 \
32 | --save_steps 100 \
33 | --eval_steps 5000000 \
34 | --save_total_limit 2000 \
35 | --seed 42 \
36 | --disable_tqdm false \
37 | --ddp_find_unused_parameters false \
38 | --block_size 1024 \
39 | --overwrite_output_dir \
40 | --report_to tensorboard \
41 | --run_name ${output_model} \
42 | --bf16 \
43 | --bf16_full_eval \
44 | --gradient_checkpointing \
45 | --deepspeed ./ds_config_zero3.json \
46 | --ignore_data_skip true \
47 | --ddp_timeout 18000000 \
48 | | tee -a ${output_model}/train.log
49 |
50 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \
51 |
--------------------------------------------------------------------------------
/train/pretrain/pretrain_clm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
19 | https://huggingface.co/models?filter=text-generation
20 | """
21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
22 |
23 | import logging
24 | import math
25 | import os
26 | import sys
27 | from dataclasses import dataclass, field
28 | from torchdata.datapipes.iter import IterDataPipe, IterableWrapper
29 | from itertools import chain
30 | import deepspeed
31 | from typing import Optional,List
32 |
33 | import datasets
34 | import pandas as pd
35 | import evaluate
36 | import torch
37 | from datasets import load_dataset
38 | from datasets.combine import interleave_datasets
39 | import transformers
40 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
41 | from transformers import (
42 | CONFIG_MAPPING,
43 | MODEL_FOR_CAUSAL_LM_MAPPING,
44 | AutoConfig,
45 | AutoModelForCausalLM,
46 | AutoTokenizer,
47 | TrainerCallback,
48 | TrainerState,
49 | TrainerControl,
50 | HfArgumentParser,
51 | Trainer,
52 | TrainingArguments,
53 | default_data_collator,
54 | is_torch_tpu_available,
55 | set_seed,
56 | )
57 | import datetime
58 | from transformers.testing_utils import CaptureLogger
59 | from transformers.trainer_utils import get_last_checkpoint
60 | from transformers.utils import check_min_version, send_example_telemetry
61 | from transformers.utils.versions import require_version
62 | from datasets import interleave_datasets
63 |
64 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
65 | # check_min_version("4.27.0.dev0")
66 |
67 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
68 |
69 | logger = logging.getLogger(__name__)
70 |
71 |
72 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
73 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
74 |
75 | @dataclass
76 | class ModelArguments:
77 | """
78 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
79 | """
80 |
81 | model_name_or_path: Optional[str] = field(
82 | default=None,
83 | metadata={
84 | "help": (
85 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch."
86 | )
87 | },
88 | )
89 | model_type: Optional[str] = field(
90 | default=None,
91 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
92 | )
93 | config_overrides: Optional[str] = field(
94 | default=None,
95 | metadata={
96 | "help": (
97 | "Override some existing default config settings when a model is trained from scratch. Example: "
98 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
99 | )
100 | },
101 | )
102 | config_name: Optional[str] = field(
103 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
104 | )
105 | tokenizer_name: Optional[str] = field(
106 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
107 | )
108 | cache_dir: Optional[str] = field(
109 | default=None,
110 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
111 | )
112 | use_fast_tokenizer: bool = field(
113 | default=True,
114 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
115 | )
116 | model_revision: str = field(
117 | default="main",
118 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
119 | )
120 | use_auth_token: bool = field(
121 | default=False,
122 | metadata={
123 | "help": (
124 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
125 | "with private models)."
126 | )
127 | },
128 | )
129 | torch_dtype: Optional[str] = field(
130 | default=None,
131 | metadata={
132 | "help": (
133 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
134 | "dtype will be automatically derived from the model's weights."
135 | ),
136 | "choices": ["auto", "bfloat16", "float16", "float32"],
137 | },
138 | )
139 |
140 | def __post_init__(self):
141 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
142 | raise ValueError(
143 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
144 | )
145 |
146 |
147 | @dataclass
148 | class DataTrainingArguments:
149 | """
150 | Arguments pertaining to what data we are going to input our model for training and eval.
151 | """
152 |
153 | dataset_name: Optional[str] = field(
154 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
155 | )
156 | dataset_config_name: Optional[str] = field(
157 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
158 | )
159 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."})
160 | validation_files: Optional[List[str]] = field(
161 | default=None,
162 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
163 | )
164 | max_train_samples: Optional[int] = field(
165 | default=None,
166 | metadata={
167 | "help": (
168 | "For debugging purposes or quicker training, truncate the number of training examples to this "
169 | "value if set."
170 | )
171 | },
172 | )
173 | max_eval_samples: Optional[int] = field(
174 | default=None,
175 | metadata={
176 | "help": (
177 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
178 | "value if set."
179 | )
180 | },
181 | )
182 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
183 | block_size: Optional[int] = field(
184 | default=None,
185 | metadata={
186 | "help": (
187 | "Optional input sequence length after tokenization. "
188 | "The training dataset will be truncated in block of this size for training. "
189 | "Default to the model max input length for single sentence inputs (take into account special tokens)."
190 | )
191 | },
192 | )
193 | overwrite_cache: bool = field(
194 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
195 | )
196 | validation_split_percentage: Optional[int] = field(
197 | default=5,
198 | metadata={
199 | "help": "The percentage of the train set used as validation set in case there's no validation split"
200 | },
201 | )
202 | preprocessing_num_workers: Optional[int] = field(
203 | default=None,
204 | metadata={"help": "The number of processes to use for the preprocessing."},
205 | )
206 | keep_linebreaks: bool = field(
207 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
208 | )
209 |
210 | def __post_init__(self):
211 | if self.streaming:
212 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
213 |
214 | if self.dataset_name is None and self.train_files is None and self.validation_files is None:
215 | raise ValueError("Need either a dataset name or a training/validation file.")
216 | else:
217 | if self.train_files is not None:
218 | extension = self.train_files[0].split(".")[-1]
219 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
220 | if self.validation_files is not None:
221 | extension = self.validation_files[0].split(".")[-1]
222 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
223 |
224 | def main():
225 | # See all possible arguments in src/transformers/training_args.py
226 | # or by passing the --help flag to this script.
227 | # We now keep distinct sets of args, for a cleaner separation of concerns.
228 |
229 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
230 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
231 | # If we pass only one argument to the script and it's the path to a json file,
232 | # let's parse it to get our arguments.
233 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
234 | else:
235 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
236 |
237 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
238 | # information sent is the one passed as arguments along with your Python/PyTorch versions.
239 | send_example_telemetry("run_clm", model_args, data_args)
240 |
241 | # Setup logging
242 | logging.basicConfig(
243 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
244 | datefmt="%m/%d/%Y %H:%M:%S",
245 | handlers=[logging.StreamHandler(sys.stdout)],
246 | )
247 |
248 | if training_args.should_log:
249 | # The default of training_args.log_level is passive, so we set log level at info here to have that default.
250 | transformers.utils.logging.set_verbosity_info()
251 |
252 | log_level = training_args.get_process_log_level()
253 | logger.setLevel(log_level)
254 | datasets.utils.logging.set_verbosity(log_level)
255 | transformers.utils.logging.set_verbosity(log_level)
256 | transformers.utils.logging.enable_default_handler()
257 | transformers.utils.logging.enable_explicit_format()
258 |
259 | # Log on each process the small summary:
260 | logger.warning(
261 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
262 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
263 | )
264 | logger.info(f"Training/evaluation parameters {training_args}")
265 |
266 | # Detecting last checkpoint.
267 | last_checkpoint = None
268 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
269 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
270 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
271 | raise ValueError(
272 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
273 | "Use --overwrite_output_dir to overcome."
274 | )
275 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
276 | logger.info(
277 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
278 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
279 | )
280 |
281 | # Set seed before initializing model.
282 | set_seed(training_args.seed)
283 |
284 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
285 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
286 | # (the dataset will be downloaded automatically from the datasets Hub).
287 | #
288 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
289 | # 'text' is found. You can easily tweak this behavior (see below).
290 | #
291 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently
292 | # download the dataset.
293 | if True:
294 | data_files = {}
295 | dataset_args = {}
296 | if data_args.train_files is not None:
297 |
298 | print(data_args.train_files)
299 | data_files["train"] = data_args.train_files
300 |             print('total number of training files', len(data_args.train_files))
301 | if data_args.validation_files is not None:
302 | data_files["validation"] = data_args.validation_files
303 | extension = (
304 | data_files["train"][0].split(".")[-1]
305 | if data_files["train"] is not None
306 | else data_args.validation_files.split(".")[-1]
307 | )
308 | if extension == "txt":
309 | extension = "text"
310 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
311 |
312 |
313 | raw_datasets = load_dataset(
314 | extension,
315 | data_files=data_files,
316 | streaming=data_args.streaming,
317 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'),
318 | use_auth_token=True if model_args.use_auth_token else None,
319 | **dataset_args,
320 | )
321 | if data_args.streaming:
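    |             # In streaming mode, shuffling uses a bounded buffer: up to 1,000,000 examples are buffered and sampled from.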
322 | raw_datasets = raw_datasets.shuffle(seed=training_args.seed, buffer_size=1000000)
323 | # If no validation data is there, validation_split_percentage will be used to divide the dataset.
324 | if "validation" not in raw_datasets.keys():
325 | raw_datasets["validation"] = load_dataset(
326 | extension,
327 | data_files=data_files,
328 | split=f"train[:{data_args.validation_split_percentage}%]",
329 | cache_dir=model_args.cache_dir,
330 | use_auth_token=True if model_args.use_auth_token else None,
331 | **dataset_args,
332 | )
333 | raw_datasets["train"] = load_dataset(
334 | extension,
335 | data_files=data_files,
336 | split=f"train[{data_args.validation_split_percentage}%:]",
337 | cache_dir=model_args.cache_dir,
338 | use_auth_token=True if model_args.use_auth_token else None,
339 | **dataset_args,
340 | )
341 |
342 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
343 | # https://huggingface.co/docs/datasets/loading_datasets.html.
344 |
345 | # Load pretrained model and tokenizer
346 | #
347 | # Distributed training:
348 | # The .from_pretrained methods guarantee that only one local process can concurrently
349 | # download model & vocab.
350 |
351 | config_kwargs = {
352 | "cache_dir": model_args.cache_dir,
353 | "revision": model_args.model_revision,
354 | "use_auth_token": True if model_args.use_auth_token else None,
355 | }
356 | if model_args.config_name:
357 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
358 | elif model_args.model_name_or_path:
359 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
360 | else:
361 | config = CONFIG_MAPPING[model_args.model_type]()
362 | logger.warning("You are instantiating a new config instance from scratch.")
363 | if model_args.config_overrides is not None:
364 | logger.info(f"Overriding config: {model_args.config_overrides}")
365 | config.update_from_string(model_args.config_overrides)
366 | logger.info(f"New config: {config}")
367 |
368 | print(training_args.local_rank,'start load tokenizer')
369 | tokenizer_kwargs = {
370 | "cache_dir": model_args.cache_dir,
371 | "use_fast": model_args.use_fast_tokenizer,
372 | "revision": model_args.model_revision,
373 | "use_auth_token": True if model_args.use_auth_token else None,
374 | }
375 | if model_args.tokenizer_name:
376 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
377 | elif model_args.model_name_or_path:
378 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
379 | else:
380 | raise ValueError(
381 | "You are instantiating a new tokenizer from scratch. This is not supported by this script."
382 | "You can do it from another script, save it, and load it from here, using --tokenizer_name."
383 | )
384 | print(training_args.local_rank,'end load tokenizer')
385 | print(training_args.local_rank,'start load model')
386 | if model_args.model_name_or_path:
387 | torch_dtype = (
388 | model_args.torch_dtype
389 | if model_args.torch_dtype in ["auto", None]
390 | else getattr(torch, model_args.torch_dtype)
391 | )
392 | model = AutoModelForCausalLM.from_pretrained(
393 | model_args.model_name_or_path,
394 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
395 | config=config,
396 | cache_dir=model_args.cache_dir,
397 | revision=model_args.model_revision,
398 | trust_remote_code=True,
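    |             # use_flash_attention_2=True enables FlashAttention-2 kernels at load time (requires the flash-attn package).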
399 | use_flash_attention_2=True,
400 | use_auth_token=True if model_args.use_auth_token else None,
401 | )
402 | else:
403 | model = AutoModelForCausalLM.from_config(config,trust_remote_code=True)
404 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
405 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
406 | print(training_args.local_rank,'end load model')
407 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
408 | # on a small vocab and want a smaller embedding size, remove this test.
409 | embedding_size = model.get_input_embeddings().weight.shape[0]
410 | if len(tokenizer) > embedding_size:
411 | model.resize_token_embeddings(len(tokenizer))
412 | # Preprocessing the datasets.
413 | # First we tokenize all the texts.
414 | if training_args.do_train:
415 | if data_args.streaming:
416 | dataset_head = raw_datasets["train"].take(3)
417 | print(list(dataset_head))
418 | column_names = list(list(dataset_head)[0].keys())
419 | else:
420 | column_names = list(raw_datasets["train"].features)
421 | else:
422 | if data_args.streaming:
423 | dataset_head = raw_datasets["validation"].take(3)
424 | column_names = list(list(dataset_head)[0].keys())
425 | else:
426 | column_names = list(raw_datasets["validation"].features)
427 | print(column_names)
428 | text_column_name = "text" if "text" in column_names else column_names[0]
429 |
430 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
431 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
432 |
433 | def tokenize_function(examples):
434 | with CaptureLogger(tok_logger) as cl:
435 |             output = tokenizer([item for item in examples[text_column_name]])
436 | return output
437 |
438 | with training_args.main_process_first(desc="dataset map tokenization"):
439 | if not data_args.streaming:
440 | tokenized_datasets = raw_datasets.map(
441 | tokenize_function,
442 | batched=True,
443 | num_proc=data_args.preprocessing_num_workers,
444 | remove_columns=column_names,
445 | load_from_cache_file=not data_args.overwrite_cache,
446 | desc="Running tokenizer on dataset",
447 | )
448 | else:
449 | tokenized_datasets = raw_datasets.map(
450 | tokenize_function,
451 | batched=True,
452 | remove_columns=column_names,
453 | batch_size = 60000,
454 | )
455 |
456 | if data_args.block_size is None:
457 | block_size = tokenizer.model_max_length
458 | if block_size > 1024:
459 | logger.warning(
460 | "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
461 | " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
462 | " override this default with `--block_size xxx`."
463 | )
464 | block_size = 1024
465 | else:
466 | if data_args.block_size > tokenizer.model_max_length:
467 | logger.warning(
468 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
469 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
470 | )
471 | block_size = min(data_args.block_size, tokenizer.model_max_length)
472 |
473 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
474 | def group_texts(examples):
475 | # Concatenate all texts.
476 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
477 | # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
478 | total_length = len(concatenated_examples[list(examples.keys())[0]])
479 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
480 | # customize this part to your needs.
481 | if total_length >= block_size:
482 | total_length = (total_length // block_size) * block_size
483 | # Split by chunks of max_len.
484 | result = {
485 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
486 | for k, t in concatenated_examples.items()
487 | }
488 | # print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
489 |         logger.info("group_texts: input examples length %d, after grouping size %d" % (len(examples['input_ids']), len(result["input_ids"])))
490 | result["labels"] = result["input_ids"].copy()
491 | return result
492 |
493 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
494 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
495 | # to preprocess.
496 | #
497 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
498 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
499 |
500 | with training_args.main_process_first(desc="grouping texts together"):
501 | if not data_args.streaming:
502 | lm_datasets = tokenized_datasets.map(
503 | group_texts,
504 | batched=True,
505 | num_proc=data_args.preprocessing_num_workers,
506 | load_from_cache_file=not data_args.overwrite_cache,
507 | desc=f"Grouping texts in chunks of {block_size}",
508 | batch_size = 40000,
509 | )
510 | else:
511 | lm_datasets = tokenized_datasets.map(
512 | group_texts,
513 | batched=True,
514 | batch_size = 60000,
515 | )
516 | print(training_args.local_rank,'start select train_dataset')
517 | if training_args.do_train:
518 | if "train" not in tokenized_datasets:
519 | raise ValueError("--do_train requires a train dataset")
520 | train_dataset = lm_datasets["train"]
521 | if data_args.max_train_samples is not None and data_args.streaming==False:
522 | max_train_samples = min(len(train_dataset), data_args.max_train_samples)
523 | train_dataset = train_dataset.select(range(max_train_samples))
524 | print(training_args.local_rank,'end select train_dataset')
525 |
526 | if training_args.do_eval:
527 | if "validation" not in tokenized_datasets:
528 | raise ValueError("--do_eval requires a validation dataset")
529 | print(training_args.local_rank,'start select eval_dataset')
530 | eval_dataset = lm_datasets["validation"]
531 | if data_args.max_eval_samples is not None and data_args.streaming==False :
532 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
533 | eval_dataset = eval_dataset.select(range(max_eval_samples))
534 | print(training_args.local_rank,'end select eval_dataset')
535 | def preprocess_logits_for_metrics(logits, labels):
536 | if isinstance(logits, tuple):
537 | # Depending on the model and config, logits may contain extra tensors,
538 | # like past_key_values, but logits always come first
539 | logits = logits[0]
540 | return logits.argmax(dim=-1)
541 | print(training_args.local_rank,'start load metric')
542 | metric = evaluate.load("accuracy.py")
543 | print(training_args.local_rank,'end load metric')
544 |
545 | def compute_metrics(eval_preds):
546 | preds, labels = eval_preds
547 | # preds have the same shape as the labels, after the argmax(-1) has been calculated
548 | # by preprocess_logits_for_metrics but we need to shift the labels
549 | labels = labels[:, 1:].reshape(-1)
550 | preds = preds[:, :-1].reshape(-1)
551 | return metric.compute(predictions=preds, references=labels)
552 |
553 | print(training_args.local_rank,'Initialize our Trainer')
554 | trainer = Trainer(
555 | model=model,
556 | args=training_args,
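    |         # Wrap the (possibly streaming) datasets so the Trainer can iterate over them.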
557 | train_dataset= IterableWrapper(train_dataset) if training_args.do_train else None,
558 | eval_dataset= IterableWrapper(eval_dataset) if training_args.do_eval else None,
559 | tokenizer=tokenizer,
560 | # Data collator will default to DataCollatorWithPadding, so we change it.
561 | data_collator=default_data_collator,
562 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
563 |         preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
564 | # callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None),
565 | )
566 |
567 | if training_args.do_train:
568 | checkpoint = None
569 | if training_args.resume_from_checkpoint is not None:
570 | checkpoint = training_args.resume_from_checkpoint
571 | elif last_checkpoint is not None:
572 | checkpoint = last_checkpoint
573 |
574 | print(training_args.local_rank,'start train')
575 | train_result = trainer.train(resume_from_checkpoint=checkpoint)
576 | trainer.save_model() # Saves the tokenizer too for easy upload
577 |
578 | metrics = train_result.metrics
579 |
580 | max_train_samples = (
581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
582 | )
583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
584 |
585 | trainer.log_metrics("train", metrics)
586 | trainer.save_metrics("train", metrics)
587 | trainer.save_state()
588 |
589 | # Evaluation
590 | if training_args.do_eval:
591 | logger.info("*** Evaluate ***")
592 |
593 | metrics = trainer.evaluate()
594 |
595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
597 | try:
598 | perplexity = math.exp(metrics["eval_loss"])
599 | except OverflowError:
600 | perplexity = float("inf")
601 | metrics["perplexity"] = perplexity
602 |
603 | trainer.log_metrics("eval", metrics)
604 | trainer.save_metrics("eval", metrics)
605 |
606 |
607 |
608 | def _mp_fn(index):
609 | # For xla_spawn (TPUs)
610 | main()
611 |
612 |
613 | if __name__ == "__main__":
614 | main()
615 |
--------------------------------------------------------------------------------
/train/sft/accuracy.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Accuracy metric."""
15 |
16 | import datasets
17 | from sklearn.metrics import accuracy_score
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
24 | Accuracy = (TP + TN) / (TP + TN + FP + FN)
25 | Where:
26 | TP: True positive
27 | TN: True negative
28 | FP: False positive
29 | FN: False negative
30 | """
31 |
32 |
33 | _KWARGS_DESCRIPTION = """
34 | Args:
35 | predictions (`list` of `int`): Predicted labels.
36 | references (`list` of `int`): Ground truth labels.
37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
38 |     sample_weight (`list` of `float`): Sample weights. Defaults to None.
39 |
40 | Returns:
41 |     accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0 if `normalize` is set to `True`, or the number of examples input if `normalize` is set to `False`. A higher score means higher accuracy.
42 |
43 | Examples:
44 |
45 | Example 1-A simple example
46 | >>> accuracy_metric = evaluate.load("accuracy")
47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
48 | >>> print(results)
49 | {'accuracy': 0.5}
50 |
51 | Example 2-The same as Example 1, except with `normalize` set to `False`.
52 | >>> accuracy_metric = evaluate.load("accuracy")
53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
54 | >>> print(results)
55 | {'accuracy': 3.0}
56 |
57 | Example 3-The same as Example 1, except with `sample_weight` set.
58 | >>> accuracy_metric = evaluate.load("accuracy")
59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
60 | >>> print(results)
61 | {'accuracy': 0.8778625954198473}
62 | """
63 |
64 |
65 | _CITATION = """
66 | @article{scikit-learn,
67 | title={Scikit-learn: Machine Learning in {P}ython},
68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
71 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
72 | journal={Journal of Machine Learning Research},
73 | volume={12},
74 | pages={2825--2830},
75 | year={2011}
76 | }
77 | """
78 |
79 |
80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
81 | class Accuracy(evaluate.Metric):
82 | def _info(self):
83 | return evaluate.MetricInfo(
84 | description=_DESCRIPTION,
85 | citation=_CITATION,
86 | inputs_description=_KWARGS_DESCRIPTION,
87 | features=datasets.Features(
88 | {
89 | "predictions": datasets.Sequence(datasets.Value("int32")),
90 | "references": datasets.Sequence(datasets.Value("int32")),
91 | }
92 | if self.config_name == "multilabel"
93 | else {
94 | "predictions": datasets.Value("int32"),
95 | "references": datasets.Value("int32"),
96 | }
97 | ),
98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
99 | )
100 |
101 | def _compute(self, predictions, references, normalize=True, sample_weight=None):
102 | return {
103 | "accuracy": float(
104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
105 | )
106 | }
107 |
--------------------------------------------------------------------------------
/train/sft/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "betas": "auto",
15 | "eps": "auto",
16 | "weight_decay": "auto"
17 | }
18 | },
19 |
20 | "scheduler": {
21 | "type": "WarmupDecayLR",
22 | "params": {
23 | "last_batch_iteration": -1,
24 | "total_num_steps": "auto",
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 |
31 | "zero_optimization": {
32 | "stage": 2,
33 | "offload_optimizer": {
34 | "device": "cpu",
35 | "pin_memory": true
36 | },
37 | "offload_param": {
38 | "device": "cpu",
39 | "pin_memory": true
40 | },
41 | "allgather_partitions": true,
42 | "allgather_bucket_size": 5e8,
43 | "overlap_comm": true,
44 | "reduce_scatter": true,
45 | "reduce_bucket_size": 5e8,
46 | "contiguous_gradients": true
47 | },
48 | "activation_checkpointing": {
49 | "partition_activations": false,
50 | "cpu_checkpointing": false,
51 | "contiguous_memory_optimization": false,
52 | "number_checkpoints": null,
53 | "synchronize_checkpoint_boundary": false,
54 | "profile": false
55 | },
56 | "gradient_accumulation_steps": "auto",
57 | "gradient_clipping": "auto",
58 | "steps_per_print": 2000,
59 | "train_batch_size": "auto",
60 | "min_lr": 5e-7,
61 | "train_micro_batch_size_per_gpu": "auto",
62 | "wall_clock_breakdown": false
63 | }
--------------------------------------------------------------------------------
/train/sft/finetune.sh:
--------------------------------------------------------------------------------
1 | output_model=save_folder
2 | # Change this to your own output directory
3 | if [ ! -d ${output_model} ];then
4 | mkdir ${output_model}
5 | fi
6 | cp ./finetune.sh ${output_model}
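  | # --include localhost:1,0 restricts DeepSpeed to GPUs 0 and 1 on this machine; adjust to match your hardware.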
7 | deepspeed --include localhost:1,0 finetune_clm.py \
8 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \
9 | --train_files ../../data/train_sft.csv \
10 | --validation_files ../../data/dev_sft.csv \
11 | ../../data/dev_sft_sharegpt.csv \
12 | --per_device_train_batch_size 1 \
13 | --per_device_eval_batch_size 1 \
14 | --do_train \
15 | --do_eval \
16 | --use_fast_tokenizer false \
17 | --output_dir ${output_model} \
18 | --evaluation_strategy steps \
19 | --max_eval_samples 800 \
20 | --learning_rate 1e-4 \
21 | --gradient_accumulation_steps 8 \
22 | --num_train_epochs 10 \
23 | --warmup_steps 400 \
24 | --logging_dir ${output_model}/logs \
25 | --logging_strategy steps \
26 | --logging_steps 10 \
27 | --save_strategy steps \
28 | --preprocessing_num_workers 10 \
29 | --save_steps 20 \
30 | --eval_steps 20 \
31 | --save_total_limit 2000 \
32 | --seed 42 \
33 | --disable_tqdm false \
34 | --ddp_find_unused_parameters false \
35 | --block_size 2048 \
36 | --report_to tensorboard \
37 | --overwrite_output_dir \
38 | --deepspeed ds_config_zero2.json \
39 | --ignore_data_skip true \
40 | --bf16 \
41 | --gradient_checkpointing \
42 | --bf16_full_eval \
43 | --ddp_timeout 18000000 \
44 | | tee -a ${output_model}/train.log
45 |
46 |
47 |
48 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \
49 |
--------------------------------------------------------------------------------
/train/sft/finetune_clm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
19 | https://huggingface.co/models?filter=text-generation
20 | """
21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
22 |
23 | import logging
24 | import math
25 | import os
26 | import sys
27 | import random
28 | from dataclasses import dataclass, field
29 | from itertools import chain
30 | import deepspeed
31 | from typing import Optional,List,Union
32 |
33 | import datasets
34 | import evaluate
35 | import torch
36 | from datasets import load_dataset
37 | from peft import ( # noqa: E402
38 | LoraConfig,
39 | PeftModel,
40 | get_peft_model,
41 | get_peft_model_state_dict,
42 | prepare_model_for_int8_training,
43 | prepare_model_for_kbit_training,
44 | set_peft_model_state_dict,
45 | )
46 | import transformers
47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
48 | from transformers import (
49 | CONFIG_MAPPING,
50 | MODEL_FOR_CAUSAL_LM_MAPPING,
51 | AutoConfig,
52 | AutoModelForCausalLM,
53 | AutoTokenizer,
54 | TrainerCallback,
55 | TrainerState,
56 | TrainerControl,
57 | HfArgumentParser,
58 | Trainer,
59 | TrainingArguments,
60 | default_data_collator,
61 | BitsAndBytesConfig,
62 | is_torch_tpu_available,
63 | set_seed,
64 | )
65 | from transformers.testing_utils import CaptureLogger
66 | from transformers.trainer_utils import get_last_checkpoint
67 | from transformers.utils import check_min_version, send_example_telemetry
68 | from transformers.utils.versions import require_version
69 |
70 | import pdb
71 |
72 |
73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
74 | # check_min_version("4.27.0.dev0")
75 |
76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
77 |
78 | logger = logging.getLogger(__name__)
79 |
80 |
81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
83 |
84 |
85 | @dataclass
86 | class ModelArguments:
87 | """
88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
89 | """
90 |
91 | model_name_or_path: Optional[str] = field(
92 | default=None,
93 | metadata={
94 | "help": (
95 |             "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
96 | )
97 | },
98 | )
99 | model_type: Optional[str] = field(
100 | default=None,
101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
102 | )
103 | config_overrides: Optional[str] = field(
104 | default=None,
105 | metadata={
106 | "help": (
107 | "Override some existing default config settings when a model is trained from scratch. Example: "
108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
109 | )
110 | },
111 | )
112 | config_name: Optional[str] = field(
113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
114 | )
115 | tokenizer_name: Optional[str] = field(
116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
117 | )
118 | cache_dir: Optional[str] = field(
119 | default=None,
120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
121 | )
122 | use_fast_tokenizer: bool = field(
123 | default=True,
124 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
125 | )
126 | model_revision: str = field(
127 | default="main",
128 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
129 | )
130 | use_auth_token: bool = field(
131 | default=False,
132 | metadata={
133 | "help": (
134 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
135 | "with private models)."
136 | )
137 | },
138 | )
139 |
140 | torch_dtype: Optional[str] = field(
141 | default=None,
142 | metadata={
143 | "help": (
144 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
145 | "dtype will be automatically derived from the model's weights."
146 | ),
147 | "choices": ["auto", "bfloat16", "float16", "float32"],
148 | },
149 | )
150 |
151 | def __post_init__(self):
152 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
153 | raise ValueError(
154 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
155 | )
156 |
157 |
158 |
159 | @dataclass
160 | class DataTrainingArguments:
161 | """
162 | Arguments pertaining to what data we are going to input our model for training and eval.
163 | """
164 | train_on_inputs: bool = field(
165 |         default=False, metadata={"help": "Whether to also train on the input (prompt) tokens rather than only on the target tokens"}
166 | )
167 | dataset_name: Optional[str] = field(
168 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
169 | )
170 | dataset_config_name: Optional[str] = field(
171 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
172 | )
173 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."})
174 | validation_files: Optional[List[str]] = field(
175 | default=None,
176 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
177 | )
178 | max_train_samples: Optional[int] = field(
179 | default=None,
180 | metadata={
181 | "help": (
182 | "For debugging purposes or quicker training, truncate the number of training examples to this "
183 | "value if set."
184 | )
185 | },
186 | )
187 | max_eval_samples: Optional[int] = field(
188 | default=None,
189 | metadata={
190 | "help": (
191 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
192 | "value if set."
193 | )
194 | },
195 | )
196 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
197 | block_size: Optional[int] = field(
198 | default=None,
199 | metadata={
200 | "help": (
201 | "Optional input sequence length after tokenization. "
202 | "The training dataset will be truncated in block of this size for training. "
203 | "Default to the model max input length for single sentence inputs (take into account special tokens)."
204 | )
205 | },
206 | )
207 | overwrite_cache: bool = field(
208 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
209 | )
210 | validation_split_percentage: Optional[int] = field(
211 | default=5,
212 | metadata={
213 | "help": "The percentage of the train set used as validation set in case there's no validation split"
214 | },
215 | )
216 | preprocessing_num_workers: Optional[int] = field(
217 | default=None,
218 | metadata={"help": "The number of processes to use for the preprocessing."},
219 | )
220 | keep_linebreaks: bool = field(
221 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
222 | )
223 |
224 | def __post_init__(self):
225 | if self.streaming:
226 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
227 |
228 | if self.dataset_name is None and self.train_files is None and self.validation_files is None:
229 | raise ValueError("Need either a dataset name or a training/validation file.")
230 | else:
231 | if self.train_files is not None:
232 | extension = self.train_files[0].split(".")[-1]
233 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
234 | if self.validation_files is not None:
235 | extension = self.validation_files[0].split(".")[-1]
236 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
237 |
238 | def main():
239 | # See all possible arguments in src/transformers/training_args.py
240 | # or by passing the --help flag to this script.
241 | # We now keep distinct sets of args, for a cleaner separation of concerns.
242 |
243 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
244 | # pdb.set_trace()
245 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
246 | # If we pass only one argument to the script and it's the path to a json file,
247 | # let's parse it to get our arguments.
248 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
249 | else:
250 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
251 |
252 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
253 | # information sent is the one passed as arguments along with your Python/PyTorch versions.
254 | send_example_telemetry("run_clm", model_args, data_args)
255 |
256 | # Setup logging
257 | logging.basicConfig(
258 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
259 | datefmt="%m/%d/%Y %H:%M:%S",
260 | handlers=[logging.StreamHandler(sys.stdout)],
261 | )
262 |
263 | if training_args.should_log:
264 | # The default of training_args.log_level is passive, so we set log level at info here to have that default.
265 | transformers.utils.logging.set_verbosity_info()
266 |
267 | log_level = training_args.get_process_log_level()
268 | logger.setLevel(log_level)
269 | datasets.utils.logging.set_verbosity(log_level)
270 | transformers.utils.logging.set_verbosity(log_level)
271 | transformers.utils.logging.enable_default_handler()
272 | transformers.utils.logging.enable_explicit_format()
273 |
274 | # Log on each process the small summary:
275 | logger.warning(
276 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
277 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
278 | )
279 | logger.info(f"Training/evaluation parameters {training_args}")
280 |
281 | # Detecting last checkpoint.
282 | last_checkpoint = None
283 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
284 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
285 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
286 | raise ValueError(
287 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
288 | "Use --overwrite_output_dir to overcome."
289 | )
290 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
291 | logger.info(
292 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
293 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
294 | )
295 |
296 | # Set seed before initializing model.
297 | set_seed(training_args.seed)
298 |
299 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
300 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
301 | # (the dataset will be downloaded automatically from the datasets Hub).
302 | #
303 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
304 | # 'text' is found. You can easily tweak this behavior (see below).
305 | #
306 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently
307 | # download the dataset.
308 | if True:
309 | data_files = {}
310 | dataset_args = {}
311 | if data_args.train_files is not None:
312 | data_files["train"] = data_args.train_files
313 | if data_args.validation_files is not None:
314 | data_files["validation"] = data_args.validation_files
315 | extension = (
316 | data_args.train_files[0].split(".")[-1]
317 | if data_args.train_files is not None
318 | else data_args.validation_files.split(".")[-1]
319 | )
320 | if extension == "txt":
321 | extension = "text"
322 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
323 | raw_datasets = load_dataset(
324 | extension,
325 | data_files=data_files,
326 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'),
327 | use_auth_token=True if model_args.use_auth_token else None,
328 | **dataset_args,
329 | )
330 | # If no validation data is there, validation_split_percentage will be used to divide the dataset.
331 | if "validation" not in raw_datasets.keys():
332 | raw_datasets["validation"] = load_dataset(
333 | extension,
334 | data_files=data_files,
335 | split=f"train[:{data_args.validation_split_percentage}%]",
336 | cache_dir=model_args.cache_dir,
337 | use_auth_token=True if model_args.use_auth_token else None,
338 | **dataset_args,
339 | )
340 | raw_datasets["train"] = load_dataset(
341 | extension,
342 | data_files=data_files,
343 | split=f"train[{data_args.validation_split_percentage}%:]",
344 | cache_dir=model_args.cache_dir,
345 | use_auth_token=True if model_args.use_auth_token else None,
346 | **dataset_args,
347 | )
348 |
349 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
350 | # https://huggingface.co/docs/datasets/loading_datasets.html.
351 |
352 | # Load pretrained model and tokenizer
353 | #
354 | # Distributed training:
355 | # The .from_pretrained methods guarantee that only one local process can concurrently
356 | # download model & vocab.
357 |
358 | config_kwargs = {
359 | "cache_dir": model_args.cache_dir,
360 | "revision": model_args.model_revision,
361 | "use_auth_token": True if model_args.use_auth_token else None,
362 | }
363 | if model_args.config_name:
364 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
365 | elif model_args.model_name_or_path:
366 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
367 | else:
368 | config = CONFIG_MAPPING[model_args.model_type]()
369 | logger.warning("You are instantiating a new config instance from scratch.")
370 | if model_args.config_overrides is not None:
371 | logger.info(f"Overriding config: {model_args.config_overrides}")
372 | config.update_from_string(model_args.config_overrides)
373 | logger.info(f"New config: {config}")
374 |
375 | tokenizer_kwargs = {
376 | "cache_dir": model_args.cache_dir,
377 | "use_fast": model_args.use_fast_tokenizer,
378 | "revision": model_args.model_revision,
379 | "use_auth_token": True if model_args.use_auth_token else None,
380 | "padding_side":'left'
381 | }
382 | if model_args.tokenizer_name:
383 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
384 | elif model_args.model_name_or_path:
385 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
386 | else:
387 | raise ValueError(
388 | "You are instantiating a new tokenizer from scratch. This is not supported by this script."
389 | "You can do it from another script, save it, and load it from here, using --tokenizer_name."
390 | )
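    |     # Llama tokenizers do not define a pad token, so the EOS token is reused for padding.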
391 | tokenizer.pad_token = tokenizer.eos_token
392 | if model_args.model_name_or_path:
393 | torch_dtype = (
394 | model_args.torch_dtype
395 | if model_args.torch_dtype in ["auto", None]
396 | else getattr(torch, model_args.torch_dtype)
397 | )
398 | print(torch_dtype)
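    |         # Note: the dtype is then hard-coded to fp16 below, overriding any --torch_dtype value resolved above.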
399 | torch_dtype = torch.float16
400 | model = AutoModelForCausalLM.from_pretrained(
401 | model_args.model_name_or_path,
402 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
403 | config=config,
404 | cache_dir=model_args.cache_dir,
405 | revision=model_args.model_revision,
406 | use_auth_token=True if model_args.use_auth_token else None,
407 | torch_dtype=torch_dtype,
408 | trust_remote_code=True,
409 | use_flash_attention_2=True,
410 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)}
411 | )
412 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[])
413 |
414 | else:
415 | model = AutoModelForCausalLM.from_config(config)
416 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
417 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
418 |
419 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
420 | # on a small vocab and want a smaller embedding size, remove this test.
421 | embedding_size = model.get_input_embeddings().weight.shape[0]
422 | if len(tokenizer) > embedding_size:
423 | model.resize_token_embeddings(len(tokenizer))
424 |
425 | # Preprocessing the datasets.
426 | # First we tokenize all the texts.
427 | if training_args.do_train:
428 | column_names = list(raw_datasets["train"].features)
429 | else:
430 | column_names = list(raw_datasets["validation"].features)
431 |
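    |     # One column means plain-text LM training; two columns (input/target) switch to instruction tuning with the loss masked on the prompt.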
432 | train_on_inputs = True
433 | if len(column_names)==1:
434 | text_column_name = "text" if "text" in column_names else column_names[0]
435 | elif len(column_names)==2:
436 | input_column_name = 'input' if 'input' in column_names else column_names[0]
437 | target_column_name = 'target' if 'target' in column_names else column_names[0]
438 | train_on_inputs=False
439 | else:
440 |         raise ValueError('Unexpected number of columns in the input file')
441 | print('train_on_inputs',train_on_inputs)
442 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
443 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
444 |
445 | def tokenize_function(examples):
446 | with CaptureLogger(tok_logger) as cl:
447 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None)
448 | output['labels'] = output['input_ids'].copy()
449 | return output
450 |
451 | def tokenize(prompt):
452 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None)
453 | result["labels"] = result["input_ids"].copy()
454 | return result
455 |
456 | def generate_and_tokenize_prompt(data_point):
457 | input_text = data_point[input_column_name]
458 | target_text = data_point[target_column_name]
459 | full_prompt = input_text+target_text
460 | tokenized_full_prompt = tokenize(full_prompt)
461 | if not train_on_inputs:
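    |             # Mask the prompt tokens with -100 (the loss ignore index) so only the target tokens contribute to the loss.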
462 | user_prompt = input_text
463 | tokenized_user_prompt = tokenize(user_prompt)
464 | user_prompt_len = len(tokenized_user_prompt["input_ids"])
465 | tokenized_full_prompt["labels"] = [
466 | -100
467 | ] * user_prompt_len + tokenized_full_prompt["labels"][
468 | user_prompt_len:
469 | ]
470 | return tokenized_full_prompt
471 |
472 |
473 |
474 | with training_args.main_process_first(desc="dataset map tokenization"):
475 | if not data_args.streaming:
476 | tokenized_datasets = raw_datasets.map(
477 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt,
478 | batched=True if train_on_inputs==True else False,
479 | num_proc=data_args.preprocessing_num_workers,
480 | remove_columns=column_names,
481 | load_from_cache_file=not data_args.overwrite_cache,
482 | desc="Running tokenizer on dataset",
483 | )
484 | else:
485 | tokenized_datasets = raw_datasets.map(
486 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt,
487 | batched=True if train_on_inputs==True else False,
488 | remove_columns=column_names,
489 | )
490 |
491 | if data_args.block_size is None:
492 | block_size = tokenizer.model_max_length
493 | if block_size > 2048:
494 | block_size = 2048
495 | else:
496 | block_size = min(data_args.block_size, tokenizer.model_max_length)
497 |
498 | if training_args.do_train:
499 | if "train" not in tokenized_datasets:
500 | raise ValueError("--do_train requires a train dataset")
501 | train_dataset = tokenized_datasets["train"]
502 | if data_args.max_train_samples is not None:
503 | max_train_samples = min(len(train_dataset), data_args.max_train_samples)
504 | train_dataset = train_dataset.select(range(max_train_samples))
505 | for index in random.sample(range(len(train_dataset)), 3):
506 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
507 | train_dataset = train_dataset.shuffle(seed=training_args.seed)
508 |
509 | if training_args.do_eval:
510 | if "validation" not in tokenized_datasets:
511 | raise ValueError("--do_eval requires a validation dataset")
512 | eval_dataset = tokenized_datasets["validation"]
513 | if data_args.max_eval_samples is not None:
514 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
515 | eval_dataset = eval_dataset.select(range(max_eval_samples))
516 |
517 | def preprocess_logits_for_metrics(logits, labels):
518 | if isinstance(logits, tuple):
519 | # Depending on the model and config, logits may contain extra tensors,
520 | # like past_key_values, but logits always come first
521 | logits = logits[0]
522 | return logits.argmax(dim=-1)
523 |
524 | metric = evaluate.load("accuracy.py")
525 |
526 | def compute_metrics(eval_preds):
527 | preds, labels = eval_preds
528 | # preds have the same shape as the labels, after the argmax(-1) has been calculated
529 | # by preprocess_logits_for_metrics but we need to shift the labels
530 | labels = labels[:, 1:].reshape(-1)
531 | # .reshape(-1)
532 | preds = preds[:, :-1].reshape(-1)
533 | # .reshape(-1)
534 | # print(labels.shape)
535 | # true_predictions = [
536 | # [p for (p, l) in zip(pred, gold_label) if l != -100]
537 | # for pred, gold_label in zip(preds, labels)
538 | # ]
539 | # true_labels = [
540 | # [l for (p, l) in zip(pred, gold_label) if l != -100]
541 | # for pred, gold_label in zip(preds, labels)
542 | # ]
543 | # preds = np.array(true_predictions).reshape(-1)
544 | # labels = np.array(true_labels).reshape(-1)
545 | return metric.compute(predictions=preds, references=labels)
546 |
547 | # Initialize our Trainer
548 | trainer = Trainer(
549 | model=model,
550 | args=training_args,
551 | train_dataset=train_dataset if training_args.do_train else None,
552 | eval_dataset=eval_dataset if training_args.do_eval else None,
553 | tokenizer=tokenizer,
554 | # Data collator will default to DataCollatorWithPadding, so we change it.
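    |         # DataCollatorForSeq2Seq pads inputs and labels dynamically per batch (labels padded with -100); pad_to_multiple_of=8 helps tensor-core efficiency.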
555 | data_collator=transformers.DataCollatorForSeq2Seq(
556 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
557 | ),
558 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
559 |         preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
560 | )
561 |
562 | # Training
563 | if training_args.do_train:
564 | checkpoint = None
565 | if training_args.resume_from_checkpoint is not None:
566 | checkpoint = training_args.resume_from_checkpoint
567 | elif last_checkpoint is not None:
568 | checkpoint = last_checkpoint
569 |
570 | print(training_args.local_rank,'start train')
571 |
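    |         # On PyTorch 2.x (and non-Windows platforms), compile the model for faster training.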
572 | if torch.__version__ >= "2" and sys.platform != "win32":
573 | model = torch.compile(model)
574 |
575 | train_result = trainer.train(resume_from_checkpoint=checkpoint)
576 | trainer.save_model() # Saves the tokenizer too for easy upload
577 |
578 | metrics = train_result.metrics
579 |
580 | max_train_samples = (
581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
582 | )
583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
584 |
585 | trainer.log_metrics("train", metrics)
586 | trainer.save_metrics("train", metrics)
587 | trainer.save_state()
588 |
589 | # Evaluation
590 | if training_args.do_eval:
591 | logger.info("*** Evaluate ***")
592 |
593 | metrics = trainer.evaluate()
594 |
595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
597 | try:
598 | perplexity = math.exp(metrics["eval_loss"])
599 | except OverflowError:
600 | perplexity = float("inf")
601 | metrics["perplexity"] = perplexity
602 |
603 | trainer.log_metrics("eval", metrics)
604 | trainer.save_metrics("eval", metrics)
605 |
606 |
607 |
608 | def _mp_fn(index):
609 | # For xla_spawn (TPUs)
610 | main()
611 |
612 |
613 | if __name__ == "__main__":
614 | main()
615 |
--------------------------------------------------------------------------------
/train/sft/finetune_clm_lora.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
19 | https://huggingface.co/models?filter=text-generation
20 | """
21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
22 |
23 | import logging
24 | import math
25 | import os
26 | import sys
27 | import random
28 | from dataclasses import dataclass, field
29 | from itertools import chain
30 | import deepspeed
31 | from typing import Optional,List,Union
32 |
33 | import datasets
34 | import evaluate
35 | import torch
36 | from datasets import load_dataset
37 | from peft import ( # noqa: E402
38 | LoraConfig,
39 | PeftModel,
40 | get_peft_model,
41 | get_peft_model_state_dict,
42 | prepare_model_for_int8_training,
43 | prepare_model_for_kbit_training,
44 | set_peft_model_state_dict,
45 | )
46 | import transformers
47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
48 | from transformers import (
49 | CONFIG_MAPPING,
50 | MODEL_FOR_CAUSAL_LM_MAPPING,
51 | AutoConfig,
52 | AutoModelForCausalLM,
53 | AutoTokenizer,
54 | TrainerCallback,
55 | TrainerState,
56 | TrainerControl,
57 | HfArgumentParser,
58 | Trainer,
59 | TrainingArguments,
60 | default_data_collator,
61 | BitsAndBytesConfig,
62 | is_torch_tpu_available,
63 | set_seed,
64 | )
65 | from transformers.testing_utils import CaptureLogger
66 | from transformers.trainer_utils import get_last_checkpoint
67 | from transformers.utils import check_min_version, send_example_telemetry
68 | from transformers.utils.versions import require_version
69 |
70 | import pdb
71 |
72 |
73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
74 | # check_min_version("4.27.0.dev0")
75 |
76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
77 |
78 | logger = logging.getLogger(__name__)
79 |
80 |
81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
83 |
84 |
85 | @dataclass
86 | class ModelArguments:
87 | """
88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
89 | """
90 |
91 | model_name_or_path: Optional[str] = field(
92 | default=None,
93 | metadata={
94 | "help": (
95 |             "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
96 | )
97 | },
98 | )
99 | model_type: Optional[str] = field(
100 | default=None,
101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
102 | )
103 | config_overrides: Optional[str] = field(
104 | default=None,
105 | metadata={
106 | "help": (
107 | "Override some existing default config settings when a model is trained from scratch. Example: "
108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
109 | )
110 | },
111 | )
112 | config_name: Optional[str] = field(
113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
114 | )
115 | tokenizer_name: Optional[str] = field(
116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
117 | )
118 | cache_dir: Optional[str] = field(
119 | default=None,
120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
121 | )
122 | lora_r: Optional[int] = field(default=16)
123 | lora_alpha: Optional[int] = field(default=32)
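    |     # The default targets every attention and MLP projection in a Llama block.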
124 | target_modules: Optional[str] = field(
125 | default='q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj',
126 | metadata={
127 | "help": "List of module names or regex expression of the module names to replace with Lora."
128 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
129 | },
130 | )
131 | use_fast_tokenizer: bool = field(
132 | default=True,
133 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
134 | )
135 | load_in_bits: Optional[int] = field(default=8)
136 | model_revision: str = field(
137 | default="main",
138 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
139 | )
140 | use_auth_token: bool = field(
141 | default=False,
142 | metadata={
143 | "help": (
144 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
145 | "with private models)."
146 | )
147 | },
148 | )
149 |
150 | torch_dtype: Optional[str] = field(
151 | default=None,
152 | metadata={
153 | "help": (
154 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
155 | "dtype will be automatically derived from the model's weights."
156 | ),
157 | "choices": ["auto", "bfloat16", "float16", "float32"],
158 | },
159 | )
160 |
161 | def __post_init__(self):
162 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
163 | raise ValueError(
164 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
165 | )
166 |         if isinstance(self.target_modules, str):
167 | self.target_modules = self.target_modules.split(',')
168 |
169 |
170 | @dataclass
171 | class DataTrainingArguments:
172 | """
173 | Arguments pertaining to what data we are going to input our model for training and eval.
174 | """
175 | train_on_inputs: bool = field(
176 |         default=False, metadata={"help": "Whether to also train on the input (prompt) tokens rather than only on the target tokens"}
177 | )
178 | dataset_name: Optional[str] = field(
179 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
180 | )
181 | dataset_config_name: Optional[str] = field(
182 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
183 | )
184 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."})
185 | validation_files: Optional[List[str]] = field(
186 | default=None,
187 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
188 | )
189 | max_train_samples: Optional[int] = field(
190 | default=None,
191 | metadata={
192 | "help": (
193 | "For debugging purposes or quicker training, truncate the number of training examples to this "
194 | "value if set."
195 | )
196 | },
197 | )
198 | max_eval_samples: Optional[int] = field(
199 | default=None,
200 | metadata={
201 | "help": (
202 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
203 | "value if set."
204 | )
205 | },
206 | )
207 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
208 | block_size: Optional[int] = field(
209 | default=None,
210 | metadata={
211 | "help": (
212 | "Optional input sequence length after tokenization. "
213 | "The training dataset will be truncated in block of this size for training. "
214 | "Default to the model max input length for single sentence inputs (take into account special tokens)."
215 | )
216 | },
217 | )
218 | overwrite_cache: bool = field(
219 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
220 | )
221 | validation_split_percentage: Optional[int] = field(
222 | default=5,
223 | metadata={
224 | "help": "The percentage of the train set used as validation set in case there's no validation split"
225 | },
226 | )
227 | preprocessing_num_workers: Optional[int] = field(
228 | default=None,
229 | metadata={"help": "The number of processes to use for the preprocessing."},
230 | )
231 | keep_linebreaks: bool = field(
232 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
233 | )
234 |
235 | def __post_init__(self):
236 | if self.streaming:
237 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
238 |
239 | if self.dataset_name is None and self.train_files is None and self.validation_files is None:
240 | raise ValueError("Need either a dataset name or a training/validation file.")
241 | else:
242 | if self.train_files is not None:
243 | extension = self.train_files[0].split(".")[-1]
244 | assert extension in ["csv", "json", "txt"], "`train_files` should be csv, json or txt files."
245 | if self.validation_files is not None:
246 | extension = self.validation_files[0].split(".")[-1]
247 | assert extension in ["csv", "json", "txt"], "`validation_files` should be csv, json or txt files."
248 |
249 | class SavePeftModelCallback(TrainerCallback):
250 | def on_save(
251 | self,
252 | args: TrainingArguments,
253 | state: TrainerState,
254 | control: TrainerControl,
255 | **kwargs,
256 | ):
257 | if state.is_world_process_zero:
258 | print('+++++++++++++++++save call back++++++++++++++++')
259 | checkpoint_folder = os.path.join(
260 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
261 | )
262 | kwargs["model"].save_pretrained(checkpoint_folder)
263 |
264 | pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
265 | if os.path.exists(pytorch_model_path):
266 | os.remove(pytorch_model_path)
267 | return control
268 |
269 | def main():
270 | # See all possible arguments in src/transformers/training_args.py
271 | # or by passing the --help flag to this script.
272 | # We now keep distinct sets of args, for a cleaner separation of concerns.
273 |
274 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
275 | # pdb.set_trace()
276 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
277 | # If we pass only one argument to the script and it's the path to a json file,
278 | # let's parse it to get our arguments.
279 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
280 | else:
281 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
282 |
283 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
284 | # information sent is the one passed as arguments along with your Python/PyTorch versions.
285 | send_example_telemetry("run_clm", model_args, data_args)
286 |
287 | # Setup logging
288 | logging.basicConfig(
289 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
290 | datefmt="%m/%d/%Y %H:%M:%S",
291 | handlers=[logging.StreamHandler(sys.stdout)],
292 | )
293 |
294 | if training_args.should_log:
295 | # The default of training_args.log_level is passive, so we set log level at info here to have that default.
296 | transformers.utils.logging.set_verbosity_info()
297 |
298 | log_level = training_args.get_process_log_level()
299 | logger.setLevel(log_level)
300 | datasets.utils.logging.set_verbosity(log_level)
301 | transformers.utils.logging.set_verbosity(log_level)
302 | transformers.utils.logging.enable_default_handler()
303 | transformers.utils.logging.enable_explicit_format()
304 |
305 | # Log on each process the small summary:
306 | logger.warning(
307 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
308 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
309 | )
310 | logger.info(f"Training/evaluation parameters {training_args}")
311 |
312 | # Detecting last checkpoint.
313 | last_checkpoint = None
314 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
315 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
316 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
317 | raise ValueError(
318 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
319 | "Use --overwrite_output_dir to overcome."
320 | )
321 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
322 | logger.info(
323 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
324 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
325 | )
326 |
327 | # Set seed before initializing model.
328 | set_seed(training_args.seed)
329 |
330 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
331 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
332 | # (the dataset will be downloaded automatically from the datasets Hub).
333 | #
334 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
335 | # 'text' is found. You can easily tweak this behavior (see below).
336 | #
337 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently
338 | # download the dataset.
339 | if True:  # always load the datasets from local CSV/JSON/TXT files
340 | data_files = {}
341 | dataset_args = {}
342 | if data_args.train_files is not None:
343 | data_files["train"] = data_args.train_files
344 | if data_args.validation_files is not None:
345 | data_files["validation"] = data_args.validation_files
346 | extension = (
347 | data_args.train_files[0].split(".")[-1]
348 | if data_args.train_files is not None
349 | else data_args.validation_files[0].split(".")[-1]
350 | )
351 | if extension == "txt":
352 | extension = "text"
353 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
354 | raw_datasets = load_dataset(
355 | extension,
356 | data_files=data_files,
357 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'),
358 | use_auth_token=True if model_args.use_auth_token else None,
359 | **dataset_args,
360 | )
361 | # If no validation data is there, validation_split_percentage will be used to divide the dataset.
362 | if "validation" not in raw_datasets.keys():
363 | raw_datasets["validation"] = load_dataset(
364 | extension,
365 | data_files=data_files,
366 | split=f"train[:{data_args.validation_split_percentage}%]",
367 | cache_dir=model_args.cache_dir,
368 | use_auth_token=True if model_args.use_auth_token else None,
369 | **dataset_args,
370 | )
371 | raw_datasets["train"] = load_dataset(
372 | extension,
373 | data_files=data_files,
374 | split=f"train[{data_args.validation_split_percentage}%:]",
375 | cache_dir=model_args.cache_dir,
376 | use_auth_token=True if model_args.use_auth_token else None,
377 | **dataset_args,
378 | )
379 |
380 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
381 | # https://huggingface.co/docs/datasets/loading_datasets.html.
382 |
383 | # Load pretrained model and tokenizer
384 | #
385 | # Distributed training:
386 | # The .from_pretrained methods guarantee that only one local process can concurrently
387 | # download model & vocab.
388 |
389 | config_kwargs = {
390 | "cache_dir": model_args.cache_dir,
391 | "revision": model_args.model_revision,
392 | "use_auth_token": True if model_args.use_auth_token else None,
393 | }
394 | if model_args.config_name:
395 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
396 | elif model_args.model_name_or_path:
397 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
398 | else:
399 | config = CONFIG_MAPPING[model_args.model_type]()
400 | logger.warning("You are instantiating a new config instance from scratch.")
401 | if model_args.config_overrides is not None:
402 | logger.info(f"Overriding config: {model_args.config_overrides}")
403 | config.update_from_string(model_args.config_overrides)
404 | logger.info(f"New config: {config}")
405 |
406 | tokenizer_kwargs = {
407 | "cache_dir": model_args.cache_dir,
408 | "use_fast": model_args.use_fast_tokenizer,
409 | "revision": model_args.model_revision,
410 | "use_auth_token": True if model_args.use_auth_token else None,
411 | "padding_side":'left'
412 | }
413 | if model_args.tokenizer_name:
414 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
415 | elif model_args.model_name_or_path:
416 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
417 | else:
418 | raise ValueError(
419 | "You are instantiating a new tokenizer from scratch. This is not supported by this script."
420 | "You can do it from another script, save it, and load it from here, using --tokenizer_name."
421 | )
422 | tokenizer.pad_token = tokenizer.eos_token
423 | lora_config = LoraConfig(
424 | r=model_args.lora_r,
425 | lora_alpha=model_args.lora_alpha,
426 | # target_modules=["query_key_value"],
427 | # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
428 | target_modules = model_args.target_modules,
429 | fan_in_fan_out = False,
430 | lora_dropout=0.05,
431 | inference_mode=False,
432 | bias="none",
433 | task_type="CAUSAL_LM",
434 | )
435 | print(lora_config)
436 | bnb_config = BitsAndBytesConfig(
437 | load_in_4bit=True,
438 | bnb_4bit_use_double_quant=True,
439 | bnb_4bit_quant_type="nf4",
440 | bnb_4bit_compute_dtype=torch.bfloat16
441 | )
442 | if model_args.model_name_or_path:
443 | torch_dtype = (
444 | model_args.torch_dtype
445 | if model_args.torch_dtype in ["auto", None]
446 | else getattr(torch, model_args.torch_dtype)
447 | )
448 | print(torch_dtype)
449 | torch_dtype = torch.float16  # NOTE: hard-coded to float16, overriding the --torch_dtype value resolved above
450 | model = AutoModelForCausalLM.from_pretrained(
451 | model_args.model_name_or_path,
452 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
453 | config=config,
454 | cache_dir=model_args.cache_dir,
455 | revision=model_args.model_revision,
456 | use_auth_token=True if model_args.use_auth_token else None,
457 | torch_dtype=torch_dtype,
458 | load_in_8bit=True if model_args.load_in_bits==8 else False,
459 | trust_remote_code=True,
460 | use_flash_attention_2=True,
461 | quantization_config=bnb_config if model_args.load_in_bits==4 else None,
462 | # device_map = 'auto'
463 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)}
464 | )
465 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[])
466 |
467 | else:
468 | model = AutoModelForCausalLM.from_config(config)
469 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
470 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
471 |
472 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
473 | # on a small vocab and want a smaller embedding size, remove this test.
474 | embedding_size = model.get_input_embeddings().weight.shape[0]
475 | if len(tokenizer) > embedding_size:
476 | model.resize_token_embeddings(len(tokenizer))
477 | if model_args.load_in_bits==8:
478 | model = prepare_model_for_int8_training(model)
479 | elif model_args.load_in_bits==4:
480 | model = prepare_model_for_kbit_training(model)
481 |
482 | # Preprocessing the datasets.
483 | # First we tokenize all the texts.
484 | if training_args.do_train:
485 | column_names = list(raw_datasets["train"].features)
486 | else:
487 | column_names = list(raw_datasets["validation"].features)
488 |
489 | train_on_inputs = True  # decided from the number of data columns below; the --train_on_inputs argument is not read here
490 | if len(column_names)==1:
491 | text_column_name = "text" if "text" in column_names else column_names[0]
492 | elif len(column_names)==2:
493 | input_column_name = 'input' if 'input' in column_names else column_names[0]
494 | target_column_name = 'target' if 'target' in column_names else column_names[1]
495 | train_on_inputs=False
496 | else:
497 | raise ValueError('Unexpected number of columns in the input file (expected 1: text, or 2: input/target)')
498 | print('train_on_inputs',train_on_inputs)
499 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
500 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
501 |
502 | def tokenize_function(examples):
503 | with CaptureLogger(tok_logger) as cl:
504 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None)
505 | output['labels'] = output['input_ids'].copy()
506 | return output
507 |
508 | def tokenize(prompt):
509 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None)
510 | result["labels"] = result["input_ids"].copy()
511 | return result
512 |
513 | def generate_and_tokenize_prompt(data_point):
514 | input_text = data_point[input_column_name]
515 | target_text = data_point[target_column_name]
516 | full_prompt = input_text+target_text
517 | tokenized_full_prompt = tokenize(full_prompt)
518 | if not train_on_inputs:
519 | user_prompt = input_text
520 | tokenized_user_prompt = tokenize(user_prompt)
521 | user_prompt_len = len(tokenized_user_prompt["input_ids"])
522 | tokenized_full_prompt["labels"] = [
523 | -100  # prompt tokens are masked with -100 so the loss is computed only on the target
524 | ] * user_prompt_len + tokenized_full_prompt["labels"][
525 | user_prompt_len:
526 | ]
527 | return tokenized_full_prompt
528 |
529 |
530 |
531 | with training_args.main_process_first(desc="dataset map tokenization"):
532 | if not data_args.streaming:
533 | tokenized_datasets = raw_datasets.map(
534 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt,
535 | batched=True if train_on_inputs==True else False,
536 | num_proc=data_args.preprocessing_num_workers,
537 | remove_columns=column_names,
538 | load_from_cache_file=not data_args.overwrite_cache,
539 | desc="Running tokenizer on dataset",
540 | )
541 | else:
542 | tokenized_datasets = raw_datasets.map(
543 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt,
544 | batched=True if train_on_inputs==True else False,
545 | remove_columns=column_names,
546 | )
547 |
548 | if data_args.block_size is None:
549 | block_size = tokenizer.model_max_length
550 | if block_size > 2048:
551 | block_size = 2048
552 | else:
553 | block_size = min(data_args.block_size, tokenizer.model_max_length)
554 |
555 | if training_args.do_train:
556 | if "train" not in tokenized_datasets:
557 | raise ValueError("--do_train requires a train dataset")
558 | train_dataset = tokenized_datasets["train"]
559 | if data_args.max_train_samples is not None:
560 | max_train_samples = min(len(train_dataset), data_args.max_train_samples)
561 | train_dataset = train_dataset.select(range(max_train_samples))
562 | for index in random.sample(range(len(train_dataset)), 3):
563 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
564 | train_dataset = train_dataset.shuffle(seed=training_args.seed)
565 |
566 | if training_args.do_eval:
567 | if "validation" not in tokenized_datasets:
568 | raise ValueError("--do_eval requires a validation dataset")
569 | eval_dataset = tokenized_datasets["validation"]
570 | if data_args.max_eval_samples is not None:
571 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
572 | eval_dataset = eval_dataset.select(range(max_eval_samples))
573 |
574 | def preprocess_logits_for_metrics(logits, labels):
575 | if isinstance(logits, tuple):
576 | # Depending on the model and config, logits may contain extra tensors,
577 | # like past_key_values, but logits always come first
578 | logits = logits[0]
579 | return logits.argmax(dim=-1)
580 |
581 | metric = evaluate.load("accuracy.py")
582 |
583 | def compute_metrics(eval_preds):
584 | preds, labels = eval_preds
585 | # preds have the same shape as the labels, after the argmax(-1) has been calculated
586 | # by preprocess_logits_for_metrics but we need to shift the labels
587 | labels = labels[:, 1:].reshape(-1)
588 | # .reshape(-1)
589 | preds = preds[:, :-1].reshape(-1)
590 | # .reshape(-1)
591 | # print(labels.shape)
592 | # true_predictions = [
593 | # [p for (p, l) in zip(pred, gold_label) if l != -100]
594 | # for pred, gold_label in zip(preds, labels)
595 | # ]
596 | # true_labels = [
597 | # [l for (p, l) in zip(pred, gold_label) if l != -100]
598 | # for pred, gold_label in zip(preds, labels)
599 | # ]
600 | # preds = np.array(true_predictions).reshape(-1)
601 | # labels = np.array(true_labels).reshape(-1)
602 | return metric.compute(predictions=preds, references=labels)
603 | # layer_norm_names=[]
604 |
605 |
606 |
607 | model = get_peft_model(model, lora_config)
608 | model.print_trainable_parameters()
609 |
610 | # Initialize our Trainer
611 | trainer = Trainer(
612 | model=model,
613 | args=training_args,
614 | train_dataset=train_dataset if training_args.do_train else None,
615 | eval_dataset=eval_dataset if training_args.do_eval else None,
616 | tokenizer=tokenizer,
617 | # Data collator will default to DataCollatorWithPadding, so we change it.
618 | data_collator=transformers.DataCollatorForSeq2Seq(
619 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
620 | ),
621 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
622 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None,
623 | callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None),
624 | )
625 |
626 | # Training
627 | if training_args.do_train:
628 | checkpoint = None
629 | '''if training_args.resume_from_checkpoint is not None:
630 | resume_from_checkpoint = training_args.resume_from_checkpoint
631 | checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
632 | if not os.path.exists(checkpoint_name):
633 | checkpoint_name = os.path.join(
634 | resume_from_checkpoint, "adapter_model.bin"
635 | ) # only LoRA model - LoRA config above has to fit
636 | resume_from_checkpoint = (
637 | False # So the trainer won't try loading its state
638 | )
639 | # The two files above have a different name depending on how they were saved, but are actually the same.
640 | if os.path.exists(checkpoint_name):
641 | print(f"Restarting from {checkpoint_name}")
642 | adapters_weights = torch.load(checkpoint_name)
643 | set_peft_model_state_dict(model, adapters_weights)
644 | else:
645 | print(f"Checkpoint {checkpoint_name} not found")
646 | # checkpoint = Fa'''
647 | if training_args.resume_from_checkpoint is not None:
648 | checkpoint = training_args.resume_from_checkpoint
649 | elif last_checkpoint is not None:
650 | checkpoint = last_checkpoint
651 |
652 | if torch.__version__ >= "2" and sys.platform != "win32":
653 | model = torch.compile(model)
654 |
655 | train_result = trainer.train(resume_from_checkpoint=checkpoint)
656 | trainer.save_model() # Saves the tokenizer too for easy upload
657 |
658 | metrics = train_result.metrics
659 |
660 | max_train_samples = (
661 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
662 | )
663 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
664 |
665 | trainer.log_metrics("train", metrics)
666 | trainer.save_metrics("train", metrics)
667 | trainer.save_state()
668 |
669 | # Evaluation
670 | if training_args.do_eval:
671 | logger.info("*** Evaluate ***")
672 |
673 | metrics = trainer.evaluate()
674 |
675 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
676 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
677 | try:
678 | perplexity = math.exp(metrics["eval_loss"])
679 | except OverflowError:
680 | perplexity = float("inf")
681 | metrics["perplexity"] = perplexity
682 |
683 | trainer.log_metrics("eval", metrics)
684 | trainer.save_metrics("eval", metrics)
685 |
686 |
687 |
688 | def _mp_fn(index):
689 | # For xla_spawn (TPUs)
690 | main()
691 |
692 |
693 | if __name__ == "__main__":
694 | main()
695 |
--------------------------------------------------------------------------------
/train/sft/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | output_model=save_folder
2 | # Change this to your own save directory
3 | if [ ! -d ${output_model} ];then
4 | mkdir ${output_model}
5 | fi
6 | export CUDA_HOME=/usr/local/cuda/
7 | export NCCL_P2P_DISABLE=1
8 | cp ./finetune_lora.sh ${output_model}
9 | deepspeed --include localhost:1,0 finetune_clm_lora.py \
10 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \
11 | --train_files ../../data/train_sft.csv \
12 | --validation_files ../../data/dev_sft.csv \
13 | ../../data/dev_sft_sharegpt.csv \
14 | --per_device_train_batch_size 1 \
15 | --per_device_eval_batch_size 1 \
16 | --do_train \
17 | --do_eval \
18 | --use_fast_tokenizer false \
19 | --output_dir ${output_model} \
20 | --evaluation_strategy steps \
21 | --max_eval_samples 800 \
22 | --learning_rate 1e-4 \
23 | --gradient_accumulation_steps 8 \
24 | --num_train_epochs 10 \
25 | --warmup_steps 400 \
26 | --load_in_bits 4 \
27 | --lora_r 8 \
28 | --lora_alpha 32 \
29 | --target_modules q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj \
30 | --logging_dir ${output_model}/logs \
31 | --logging_strategy steps \
32 | --logging_steps 10 \
33 | --save_strategy steps \
34 | --preprocessing_num_workers 10 \
35 | --save_steps 20 \
36 | --eval_steps 20 \
37 | --save_total_limit 2000 \
38 | --seed 42 \
39 | --disable_tqdm false \
40 | --ddp_find_unused_parameters false \
41 | --block_size 2048 \
42 | --report_to tensorboard \
43 | --overwrite_output_dir \
44 | --deepspeed ds_config_zero2.json \
45 | --ignore_data_skip true \
46 | --bf16 \
47 | --gradient_checkpointing \
48 | --bf16_full_eval \
49 | --ddp_timeout 18000000 \
50 | | tee -a ${output_model}/train.log
51 |
52 |
53 |
54 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \
55 |
--------------------------------------------------------------------------------
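Usage note (a minimal sketch, not part of the repository): after finetune_lora.sh finishes, SavePeftModelCallback leaves LoRA adapter weights in each checkpoint-<step> folder under the output directory. The script below shows one way such an adapter might be loaded on top of the base model for a quick smoke test; the adapter path and the prompt are placeholders, and the merge scripts under train/merge_peft_model remain the repository's own workflow for producing a merged model.

# quick_lora_smoke_test.py -- illustrative sketch only; paths and prompt are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Llama-2-7b-chat-hf"   # same base model as --model_name_or_path
adapter_path = "save_folder/checkpoint-20"          # placeholder: any checkpoint folder written during training

tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Attach the trained LoRA adapter on top of the frozen base weights.
model = PeftModel.from_pretrained(model, adapter_path)
model.eval()

prompt = "Hello, please introduce yourself."  # placeholder; format prompts the same way as the SFT training data
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))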