├── README.md ├── README_EN.md ├── assets ├── Llama4-Maverick.png ├── base_eval.png ├── ceval.jpg ├── llama.jpg ├── llama.png ├── llama2-chinese-webui.jpg ├── llama3_eval.png ├── llama_eval.jpeg ├── meta_eval_13B.md ├── meta_eval_7B.md ├── tuned_eval.png ├── wechat-new.jpeg └── wechat.jpeg ├── data ├── dev_sft.csv ├── dev_sft_sharegpt.csv └── train_sft.csv ├── docker ├── Dockerfile ├── Dockerfile_train └── docker-compose.yml ├── docs ├── chat_gradio_guide.md └── inference_speed_guide.md ├── examples ├── chat_gradio.py ├── chat_gradio_no_merge.py └── llama2_for_langchain.py ├── inference-speed ├── CPU │ └── ggml │ │ └── README.md └── GPU │ ├── FasterTransformer_example │ └── README.md │ ├── JittorLLMs_example │ └── README.md │ ├── TensorRT-LLM_example │ ├── README.md │ ├── atom_inference.py │ └── utils.py │ ├── lmdeploy_example │ ├── README.md │ └── test_api_server.py │ └── vllm_example │ ├── README.md │ ├── api_server.py │ ├── client_test.py │ ├── multi_gpus_api_server.sh │ └── single_gpu_api_server.sh ├── requirements.txt ├── scripts ├── api │ ├── README.md │ ├── accelerate_client.py │ └── accelerate_server.py ├── convert2hf │ ├── README.md │ └── convert_llama_weights_to_hf.py └── test_model │ └── test_pretrain_model.ipynb └── train ├── merge_peft_model ├── merge.sh ├── merge_muilt.sh ├── merge_muilt_peft_adapter.py └── merge_peft_adapter.py ├── pretrain ├── accuracy.py ├── ds_config_zero2.json ├── ds_config_zero3.json ├── pretrain.sh └── pretrain_clm.py └── sft ├── accuracy.py ├── ds_config_zero2.json ├── finetune.sh ├── finetune_clm.py ├── finetune_clm_lora.py └── finetune_lora.sh /assets/Llama4-Maverick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/Llama4-Maverick.png -------------------------------------------------------------------------------- /assets/base_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/base_eval.png -------------------------------------------------------------------------------- /assets/ceval.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/ceval.jpg -------------------------------------------------------------------------------- /assets/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.jpg -------------------------------------------------------------------------------- /assets/llama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.png -------------------------------------------------------------------------------- /assets/llama2-chinese-webui.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama2-chinese-webui.jpg -------------------------------------------------------------------------------- /assets/llama3_eval.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama3_eval.png -------------------------------------------------------------------------------- /assets/llama_eval.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama_eval.jpeg -------------------------------------------------------------------------------- /assets/tuned_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/tuned_eval.png -------------------------------------------------------------------------------- /assets/wechat-new.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat-new.jpeg -------------------------------------------------------------------------------- /assets/wechat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat.jpeg -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # 使用pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel作为基础镜像 2 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel 3 | 4 | RUN apt-get update -y --allow-unauthenticated 5 | RUN apt install -y git vim git-lfs 6 | 7 | #设置工作目录 8 | WORKDIR /root/Llama-Chinese 9 | 10 | # 从git上克隆llama-chinese仓库 11 | RUN git clone https://github.com/LlamaFamily/Llama-Chinese.git /root/Llama-Chinese 12 | 13 | # tsinghua source 14 | RUN mkdir -p ~/.pip 15 | RUN echo "[global]\nindex-url = https://pypi.tuna.tsinghua.edu.cn/simple" > ~/.pip/pip.conf 16 | 17 | # 使用pip安装requirements.txt 18 | RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -r requirements.txt 19 | 20 | #克隆Hugging Face仓库 21 | RUN git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat 22 | 23 | #开启7860端口 24 | EXPOSE 7860 25 | 26 | #设置启动命令 27 | ENTRYPOINT ["python", "examples/chat_gradio.py", "--model_name_or_path", "/root/Llama-Chinese/Atom-7B-Chat/"] 28 | -------------------------------------------------------------------------------- /docker/Dockerfile_train: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as builder 2 | RUN apt-get update -y --allow-unauthenticated 3 | RUN apt install git tmux htop vim -y 4 | RUN pip install bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 5 | RUN pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 6 | RUN pip install peft -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 7 | RUN pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 8 | RUN pip install deepspeed -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 9 | RUN pip install scipy sentencepiece datasets joblib 
sentence_transformers cn2an evaluate tensorboard wandb -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | app: 4 | image: flagalpha/llama2-chinese:gradio # 这里替换为你实际的镜像名 5 | volumes: 6 | - /usr/local/nvidia:/usr/local/nvidia # 让容器访问主机的NVIDIA驱动 7 | environment: 8 | - NVIDIA_VISIBLE_DEVICES=all # 让容器可以访问所有的NVIDIA GPU 9 | ports: 10 | - 7860:7860 # 在容器和主机之间映射端口 11 | deploy: 12 | resources: 13 | reservations: 14 | devices: 15 | - driver: nvidia 16 | capabilities: [gpu] # 使用Docker的设备请求来让容器使用GPU 17 | -------------------------------------------------------------------------------- /docs/chat_gradio_guide.md: -------------------------------------------------------------------------------- 1 | # Docker环境执行chat_gradio.py 2 | 3 | 系统需要准备的环境 4 | 5 | + docker: 24.0.2 6 | + docker-compose 7 | 8 | ## 第一步. 准备Docker镜像 9 | 10 | 通过docker镜像可以更方便的管理需要安装的环境依赖。所以这里可以直接通过docker容器启动[chat_gradio](../examples/chat_gradio.py), 第一步准备镜像环境。 11 | 12 | ```bash 13 | git clone https://github.com/LlamaFamily/Llama-Chinese.git 14 | 15 | cd Llama-Chinese 16 | 17 | docker build -f docker/Dockerfile -t FlagAlpha/llama2-chinese:gradio . 18 | ``` 19 | 20 | ## 第二步. 通过docker-compose启动chat_gradio 21 | 22 | 23 | ```bash 24 | cd Llama-Chinese/docker 25 | doker-compose up -d --build 26 | ``` -------------------------------------------------------------------------------- /docs/inference_speed_guide.md: -------------------------------------------------------------------------------- 1 | # 推理部署 2 | 3 | > 训练完之后或者经过微调之后的模型或者直接从[huggingface](https://huggingface.co/FlagAlpha)下载的模型,都需要部署使用。部署也就是指的模型推理,如果直接使用原生的trainsfomers进行部署,速度会比较慢。针对推理有多种加速手段,会带来较快的推理速度。 4 | 5 | 6 | 7 | ## 1. GPU推理方案 8 | 9 | ### 方案一:vllm 10 | 11 | [使用说明](../inference-speed/GPU/vllm_example/README.md) 12 | 13 | ### 方案二:TensorRT-LLM 14 | 15 | [使用说明](../inference-speed/GPU/TensorRT-LLM_example/README.md) 16 | 17 | 18 | ## 2. CPU 推理方案 19 | 20 | ### 方案一:ggml 21 | [使用说明](../inference-speed/CPU/ggml/README.md) 22 | -------------------------------------------------------------------------------- /examples/chat_gradio.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import time 3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer 4 | from threading import Thread 5 | import torch,sys,os 6 | import json 7 | import pandas 8 | import argparse 9 | 10 | with gr.Blocks() as demo: 11 | gr.Markdown("""

<h1><center>智能助手</center></h1>
""") 12 | chatbot = gr.Chatbot() 13 | msg = gr.Textbox() 14 | state = gr.State() 15 | with gr.Row(): 16 | clear = gr.Button("新话题") 17 | re_generate = gr.Button("重新回答") 18 | sent_bt = gr.Button("发送") 19 | with gr.Accordion("生成参数", open=False): 20 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3) 21 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95) 22 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0) 23 | def user(user_message, history): 24 | return "", history + [[user_message, None]] 25 | def bot(history,temperature,top_p,slider_context_times): 26 | if pandas.isnull(history[-1][1])==False: 27 | history[-1][1] = None 28 | yield history 29 | slider_context_times = int(slider_context_times) 30 | history_true = history[1:-1] 31 | prompt = '' 32 | if slider_context_times>0: 33 | prompt += '\n'.join([("Human: "+one_chat[0].replace('
<br>','\n')+'\n</s>' if one_chat[0] else '') +"<s>Assistant: "+one_chat[1].replace('<br>','\n')+'\n</s>' for one_chat in history_true[-slider_context_times:] ]) 34 | prompt += "<s>Human: "+history[-1][0].replace('<br>','\n')+"\n</s><s>
Assistant:" 35 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda') 36 | generate_input = { 37 | "input_ids":input_ids, 38 | "max_new_tokens":512, 39 | "do_sample":True, 40 | "top_k":50, 41 | "top_p":top_p, 42 | "temperature":temperature, 43 | "repetition_penalty":1.3, 44 | "streamer":streamer, 45 | "eos_token_id":tokenizer.eos_token_id, 46 | "bos_token_id":tokenizer.bos_token_id, 47 | "pad_token_id":tokenizer.pad_token_id 48 | } 49 | thread = Thread(target=model.generate, kwargs=generate_input) 50 | thread.start() 51 | start_time = time.time() 52 | bot_message ='' 53 | print('Human:',history[-1][0]) 54 | print('Assistant: ',end='',flush=True) 55 | for new_text in streamer: 56 | print(new_text,end='',flush=True) 57 | if len(new_text)==0: 58 | continue 59 | if new_text!='': 60 | bot_message+=new_text 61 | if 'Human:' in bot_message: 62 | bot_message = bot_message.split('Human:')[0] 63 | history[-1][1] = bot_message 64 | yield history 65 | end_time =time.time() 66 | print() 67 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message)) 68 | 69 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( 70 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 71 | ) 72 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then( 73 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 74 | ) 75 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot ) 76 | clear.click(lambda: [], None, chatbot, queue=False) 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path') 81 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model') 82 | args = parser.parse_args() 83 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False) 84 | tokenizer.pad_token = tokenizer.eos_token 85 | if args.is_4bit==False: 86 | model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, 87 | device_map='cuda:0' if torch.cuda.is_available() else "auto", 88 | torch_dtype=torch.float16, 89 | load_in_8bit=True, 90 | trust_remote_code=True, 91 | use_flash_attention_2=True) 92 | model.eval() 93 | else: 94 | from auto_gptq import AutoGPTQForCausalLM 95 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 96 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True) 97 | if torch.__version__ >= "2" and sys.platform != "win32": 98 | model = torch.compile(model) 99 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0") 100 | -------------------------------------------------------------------------------- /examples/chat_gradio_no_merge.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import time 3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer 4 | from threading import Thread 5 | from peft import PeftModel,PeftConfig 6 | import torch,sys,os 7 | import json 8 | import pandas 9 | import argparse 10 | 11 | with gr.Blocks() as demo: 12 | gr.Markdown("""

<h1><center>智能助手</center></h1>
""") 13 | chatbot = gr.Chatbot() 14 | msg = gr.Textbox() 15 | state = gr.State() 16 | with gr.Row(): 17 | clear = gr.Button("新话题") 18 | re_generate = gr.Button("重新回答") 19 | sent_bt = gr.Button("发送") 20 | with gr.Accordion("生成参数", open=False): 21 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3) 22 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95) 23 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0) 24 | def user(user_message, history): 25 | return "", history + [[user_message, None]] 26 | def bot(history,temperature,top_p,slider_context_times): 27 | if pandas.isnull(history[-1][1])==False: 28 | history[-1][1] = None 29 | yield history 30 | slider_context_times = int(slider_context_times) 31 | history_true = history[1:-1] 32 | prompt = '' 33 | if slider_context_times>0: 34 | prompt += '\n'.join([("Human: "+one_chat[0].replace('
<br>','\n')+'\n</s>' if one_chat[0] else '') +"<s>Assistant: "+one_chat[1].replace('<br>','\n')+'\n</s>' for one_chat in history_true[-slider_context_times:] ]) 35 | prompt += "<s>Human: "+history[-1][0].replace('<br>','\n')+"\n</s><s>
Assistant:" 36 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda') 37 | generate_input = { 38 | "input_ids":input_ids, 39 | "max_new_tokens":512, 40 | "do_sample":True, 41 | "top_k":50, 42 | "top_p":top_p, 43 | "temperature":temperature, 44 | "repetition_penalty":1.3, 45 | "streamer":streamer, 46 | "eos_token_id":tokenizer.eos_token_id, 47 | "bos_token_id":tokenizer.bos_token_id, 48 | "pad_token_id":tokenizer.pad_token_id 49 | } 50 | thread = Thread(target=model.generate, kwargs=generate_input) 51 | thread.start() 52 | start_time = time.time() 53 | bot_message ='' 54 | print('Human:',history[-1][0]) 55 | print('Assistant: ',end='',flush=True) 56 | for new_text in streamer: 57 | print(new_text,end='',flush=True) 58 | if len(new_text)==0: 59 | continue 60 | if new_text!='': 61 | bot_message+=new_text 62 | if 'Human:' in bot_message: 63 | bot_message = bot_message.split('Human:')[0] 64 | history[-1][1] = bot_message 65 | yield history 66 | end_time =time.time() 67 | print() 68 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message)) 69 | 70 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( 71 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 72 | ) 73 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then( 74 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 75 | ) 76 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot ) 77 | clear.click(lambda: [], None, chatbot, queue=False) 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path') 82 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model') 83 | args = parser.parse_args() 84 | # tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False) 85 | # tokenizer.pad_token = tokenizer.eos_token 86 | if args.is_4bit==False: 87 | config = PeftConfig.from_pretrained(args.model_name_or_path) 88 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,use_fast=False) 89 | tokenizer.pad_token = tokenizer.eos_token 90 | model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, 91 | device_map='cuda:0' if torch.cuda.is_available() else "auto", 92 | torch_dtype=torch.float16, 93 | load_in_8bit=True, 94 | low_cpu_mem_usage=True, 95 | trust_remote_code=True, 96 | use_flash_attention_2=True) 97 | model = PeftModel.from_pretrained(model, args.model_name_or_path, device_map={"": 0}) 98 | model.eval() 99 | else: 100 | from auto_gptq import AutoGPTQForCausalLM 101 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 102 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True) 103 | if torch.__version__ >= "2" and sys.platform != "win32": 104 | model = torch.compile(model) 105 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0") 106 | -------------------------------------------------------------------------------- /examples/llama2_for_langchain.py: -------------------------------------------------------------------------------- 1 | from langchain.llms.base import LLM 2 | from typing import Dict, List, Any, Optional 3 | import torch,sys,os 4 | from transformers import AutoTokenizer 5 | 6 | 7 | class Llama2(LLM): 8 | 
max_token: int = 2048 9 | temperature: float = 0.1 10 | top_p: float = 0.95 11 | tokenizer: Any 12 | model: Any 13 | 14 | def __init__(self, model_name_or_path, bit4=False): 15 | super().__init__() 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,use_fast=False) 17 | self.tokenizer.pad_token = self.tokenizer.eos_token 18 | if bit4==False: 19 | from transformers import AutoModelForCausalLM 20 | device_map = "cuda:0" if torch.cuda.is_available() else "auto" 21 | self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,device_map=device_map,torch_dtype=torch.float16,load_in_8bit=True,trust_remote_code=True,use_flash_attention_2=True) 22 | self.model.eval() 23 | else: 24 | from auto_gptq import AutoGPTQForCausalLM 25 | self.model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 26 | 27 | if torch.__version__ >= "2" and sys.platform != "win32": 28 | self.model = torch.compile(self.model) 29 | 30 | @property 31 | def _llm_type(self) -> str: 32 | return "Llama2" 33 | 34 | def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: 35 | print('prompt:',prompt) 36 | input_ids = self.tokenizer(prompt, return_tensors="pt",add_special_tokens=False).input_ids.to('cuda') 37 | generate_input = { 38 | "input_ids":input_ids, 39 | "max_new_tokens":1024, 40 | "do_sample":True, 41 | "top_k":50, 42 | "top_p":self.top_p, 43 | "temperature":self.temperature, 44 | "repetition_penalty":1.2, 45 | "eos_token_id":self.tokenizer.eos_token_id, 46 | "bos_token_id":self.tokenizer.bos_token_id, 47 | "pad_token_id":self.tokenizer.pad_token_id 48 | } 49 | generate_ids = self.model.generate(**generate_input) 50 | generate_ids = [item[len(input_ids[0]):-1] for item in generate_ids] 51 | result_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 52 | return result_message 53 | -------------------------------------------------------------------------------- /inference-speed/CPU/ggml/README.md: -------------------------------------------------------------------------------- 1 | ## 使用llama.cpp量化部署 2 | 3 | 以[llama.cpp工具](https://github.com/Rayrtfr/llama.cpp)为例,介绍模型量化并在本地部署的详细步骤。Windows则可能需要cmake等编译工具的安装。**本地快速部署体验推荐使用经过指令精调的[Atom-7B-Chat](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#%E5%9F%BA%E4%BA%8Ellama2%E7%9A%84%E4%B8%AD%E6%96%87%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8Batom)模型,有条件的推荐使用6-bit或者8-bit模型,效果更佳。** 运行前请确保: 4 | 5 | 1. 系统应有`make`(MacOS/Linux自带)或`cmake`(Windows需自行安装)编译工具 6 | 2. 建议使用Python 3.10以上编译和运行该工具 7 | 8 | 9 | ### Step 1: 克隆和编译llama.cpp 10 | 11 | 1. (可选)如果已下载旧版仓库,建议`git pull`拉取最新代码,**并执行`make clean`进行清理** 12 | 1. 拉取最新版适配过Atom大模型的llama.cpp仓库代码 13 | 14 | ```bash 15 | $ git clone https://github.com/Rayrtfr/llama.cpp 16 | ``` 17 | 18 | 2. 
对llama.cpp项目进行编译,生成`./main`(用于推理)和`./quantize`(用于量化)二进制文件。 19 | 20 | ```bash 21 | $ make 22 | ``` 23 | 24 | **Windows/Linux用户**如需启用GPU推理,则推荐与[BLAS(或cuBLAS如果有GPU)一起编译](https://github.com/Rayrtfr/llama.cpp#blas-build),可以提高prompt处理速度。以下是和cuBLAS一起编译的命令,适用于NVIDIA相关GPU。参考:[llama.cpp#blas-build](https://github.com/Rayrtfr/llama.cpp#blas-build) 25 | 26 | ```bash 27 | $ make LLAMA_CUBLAS=1 28 | ``` 29 | 30 | **macOS用户**无需额外操作,llama.cpp已对ARM NEON做优化,并且已自动启用BLAS。M系列芯片推荐使用Metal启用GPU推理,显著提升速度。只需将编译命令改为:`LLAMA_METAL=1 make`,参考[llama.cpp#metal-build](https://github.com/Rayrtfr/llama.cpp#metal-build) 31 | 32 | ```bash 33 | $ LLAMA_METAL=1 make 34 | ``` 35 | 36 | ### Step 2: 生成量化版本模型 37 | 38 | 目前llama.cpp已支持`.safetensors`文件以及huggingface格式`.bin`转换为GGUF的FP16格式。 39 | 40 | /path/Atom-7B-Chat是模型下载的目录位置。 41 | ```bash 42 | $ python convert.py --outfile ./atom-7B-cpp.gguf /path/Atom-7B-Chat 43 | 44 | $ ./quantize ./atom-7B-cpp.gguf ./ggml-atom-7B-q4_0.gguf q4_0 45 | ``` 46 | 47 | ### Step 3: 加载并启动模型 48 | 49 | 50 | - 如果想使用GPU推理:cuBLAS/Metal编译需要指定offload层数,在`./main`中指定例如`-ngl 40`表示offload 40层模型参数到GPU 51 | 52 | 53 | 使用以下命令启动聊天。 54 | ```bash 55 | text="Human: 介绍一下北京\nAssistant:" 56 | ./main -m \ 57 | ./ggml-atom-7B-q4_0.gguf \ 58 | -p "${text}" \ 59 | --logdir ./logtxt 60 | ``` 61 | 如果要带聊天的上下文,上面的text需要调整成类似这样: 62 | ```bash 63 | text="Human: 介绍一下北京\nAssistant:北京是一个美丽的城市\nHuman: 再介绍一下合肥\nAssistant:" 64 | ``` 65 | 66 | 更详细的官方说明请参考:[https://github.com/ggerganov/llama.cpp/tree/master/examples/main](https://github.com/ggerganov/llama.cpp/tree/master/examples/main) 67 | -------------------------------------------------------------------------------- /inference-speed/GPU/FasterTransformer_example/README.md: -------------------------------------------------------------------------------- 1 | # FasterTransformer && Triton 安装和使用 2 | 3 | FasterTransformer & Triton 加速LLama2模型推理。 目前支持fp16或者Int8推理,Int4目前还不支持。 4 | 5 | ## 0. 准备环境变量 6 | 7 | ```bash 8 | export BUILD_DICTIONARY="/workspace/build" 9 | export TRITON_VERSION=23.04 10 | ``` 11 | 12 | 13 | ## 一. 镜像构建 14 | 15 | 16 | 1. 构建镜像 17 | 18 | ```bash 19 | cd $BUILD_DICTIONARY 20 | git clone https://github.com/Rayrtfr/fastertransformer_backend.git 21 | 22 | cd $BUILD_DICTIONARY/fastertransformer_backend 23 | 24 | export TRITON_VERSION=23.04 25 | 26 | # 如何不想通过下面的命令构建,也可以直接下载我们已经构建好的镜像: docker pull xiangtao1994/atom_triton_ft:23.04 27 | docker build --build-arg TRITON_VERSION=${TRITON_VERSION} -t triton_ft_backend:${TRITON_VERSION} -f docker/Dockerfile . 28 | 29 | ``` 30 | TRITON_VERSION=23.04 这个镜像需的GPU的驱动版本是 Driver Version: 535.54.03,如果你的GPU的驱动不是这个版本,需要[https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12) 31 | 找到cuda driver 对应版本的 triton-inference-server。 32 | 33 | 34 | 2.启动容器 35 | 36 | ``` 37 | # 启动容器 38 | export TRITON_VERSION=23.04 39 | 40 | # 注意需要 BUILD_DICTIONARY 挂载到容器里面 41 | docker run -idt --gpus=all --net=host --shm-size=4G --name triton_ft_backend_pure \ 42 | -v $BUILD_DICTIONARY:$BUILD_DICTIONARY \ 43 | -p18888:8888 -p18000:8000 -p18001:8001 -p18002:8002 triton_ft_backend:${TRITON_VERSION} bash 44 | 45 | ```` 46 | 47 | ## 二.容器内操作 48 | 49 | 下面介绍一下[Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)模型的权重转换成FasterTransformer格式。 [Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat)也是类似的方式。 50 | 51 | 1. 
转换权重, 权重转换成FasterTransformer格式 52 | 53 | ``` 54 | cd $BUILD_DICTIONARY && git clone https://github.com/Rayrtfr/FasterTransformer.git 55 | 56 | cd $BUILD_DICTIONARY/FasterTransformer 57 | 58 | mkdir models && chmod -R 777 ./* 59 | 60 | python3 ./examples/cpp/llama/huggingface_llama_convert.py \ 61 | -saved_dir=./models/llama \ 62 | -in_file=/path/FlagAlpha/Atom-7B-Chat \ 63 | -infer_gpu_num=1 \ 64 | -weight_data_type=fp16 \ 65 | -model_name=llama 66 | ``` 67 | 68 | 2. 修改模型配置 69 | 70 | - 编辑config.pbtxt 71 | 72 | ``` bash 73 | mkdir $BUILD_DICTIONARY/triton-model-store/ 74 | 75 | cd $BUILD_DICTIONARY/triton-model-store/ 76 | 77 | cp -r $BUILD_DICTIONARY/fastertransformer_backend/all_models/llama $BUILD_DICTIONARY/triton-model-store/ 78 | 79 | # 修改 triton-model-store/llama/fastertransformer/config.pbtxt 80 | 81 | parameters { 82 | key: "tensor_para_size" 83 | value: { 84 | string_value: "1" 85 | } 86 | } 87 | 88 | ## 修改 model_checkpoint_path 为上面转换之后的路径 89 | parameters { 90 | key: "model_checkpoint_path" 91 | value: { 92 | string_value: "/workspace/build/FasterTransformer/models/llama/1-gpu/" 93 | } 94 | } 95 | 96 | ## 模型使用int8推理需要加一下面的配置 97 | parameters { 98 | key: "int8_mode" 99 | value: { 100 | string_value: "1" 101 | } 102 | } 103 | ``` 104 | 105 | 106 | 修改 model.py 107 | 108 | ``` 109 | # 修改这两个文件 110 | triton-model-store/llama/preprocessing/1/model.py 111 | triton-model-store/llama/postprocessing/1/model.py 112 | 113 | # 检查 这个路径为tokenier对应的路径 114 | self.tokenizer = LlamaTokenizer.from_pretrained("/path/FlagAlpha/Atom-7B-Chat") 115 | ``` 116 | 117 | 118 | 3. 编译 FasterTransformer Library 119 | 120 | (同一类型的模型,编译一次就行了) 121 | 编译之前检查 FasterTransformer/examples/cpp/llama/llama_config.ini 122 | 123 | ```bash 124 | # 单卡推理这里是1,多卡可以改成卡的数目 125 | tensor_para_size=1 126 | 127 | model_dir=/workspace/build/FasterTransformer/models/llama/1-gpu/ 128 | ``` 129 | 130 | 编译 FasterTransformer 131 | ```bash 132 | cd $BUILD_DICTIONARY/FasterTransformer 133 | 134 | git submodule init && git submodule update 135 | pip3 install fire jax jaxlib transformers 136 | 137 | mkdir build && cd build 138 | cmake -DSM=86 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON -D PYTHON_PATH=/usr/bin/python3 .. 139 | make -j12 140 | make install 141 | ``` 142 | 143 | 144 | ## 三. 
启动 triton server 145 | 146 | 同样在上面的容器内操作。 147 | ``` 148 | CUDA_VISIBLE_DEVICES=0 /opt/tritonserver/bin/tritonserver --model-repository=$BUILD_DICTIONARY/triton-model-store/llama/ 149 | ``` 150 | 输出 151 | ``` 152 | I0717 17:17:14.670037 70681 grpc_server.cc:2450] Started GRPCInferenceService at 0.0.0.0:8001 153 | I0717 17:17:14.670495 70681 http_server.cc:3555] Started HTTPService at 0.0.0.0:8000 154 | I0717 17:17:14.713000 70681 http_server.cc:185] Started Metrics Service at 0.0.0.0:8002 155 | ``` 156 | 157 | 158 | 同样在上面的容器内操作,启动client测试(如果在容器外注意需要修改下面的url参数的端口号) 159 | 160 | ``` 161 | python3 $BUILD_DICTIONARY/fastertransformer_backend/inference_example/llama/llama_grpc_stream_client.py \ 162 | --url 127.0.0.1:8001 \ 163 | --hf_model_location /path/FlagAlpha/Atom-7B-Chat \ 164 | -topp 0.95 165 | ``` 166 | -------------------------------------------------------------------------------- /inference-speed/GPU/JittorLLMs_example/README.md: -------------------------------------------------------------------------------- 1 | # JittorLLMs推理部署 2 | 3 | ## 配置要求 4 | 5 | * 内存要求:至少2G,推荐32G 6 | * 显存:可选, 推荐16G 7 | * 操作系统:支持Windows,Mac,Linux全平台。 8 | * 磁盘空间:至少40GB空闲磁盘空间,用于下载参数和存储交换文件。 9 | * Python版本要求至少`3.9`。 10 | 11 | 磁盘空间不够时,可以通过环境变量`JITTOR_HOME`指定缓存存放路径。 12 | 内存或者显存不够,出现进程被杀死的情况,请参考下方,[限制内存消耗的方法](#配置要求低)。 13 | 14 | ## 部署方法 15 | 16 | 可以通过下述指令安装依赖。(注意:此脚本会安装Jittor版torch,推荐用户新建环境运行) 17 | 18 | ``` 19 | # 国内使用 gitlink clone 20 | git clone https://gitlink.org.cn/jittor/JittorLLMs.git --depth 1 21 | # github: git clone https://github.com/Jittor/JittorLLMs.git --depth 1 22 | cd JittorLLMs 23 | # -i 指定用jittor的源, -I 强制重装Jittor版torch 24 | pip install -r requirements.txt -i https://pypi.jittor.org/simple -I 25 | ``` 26 | 27 | 如果出现找不到jittor版本的错误,可能是您使用的镜像还没有更新,使用如下命令更新最新版:`pip install jittor -U -i https://pypi.org/simple` 28 | 29 | 部署只需一行命令即可: 30 | 31 | ``` 32 | python cli_demo.py atom7b 33 | ``` 34 | 35 | 运行后会自动从服务器上下载模型文件到本地,会占用根目录下一定的硬盘空间。 36 | 最开始运行的时候会编译一些CUDA算子,这会花费一些时间进行加载。 37 | 38 | 内存或者显存不够,出现进程被杀死的情况,请参考下方,[限制内存消耗的方法](#配置要求低)。 39 | 40 | ### WebDemo 41 | 42 | JittorLLM通过gradio库,允许用户在浏览器之中和大模型直接进行对话。 43 | 44 | ~~~bash 45 | python web_demo.py atom7b 46 | ~~~ 47 | 48 | ### 后端服务部署 49 | 50 | JittorLLM在api.py文件之中,提供了一个架设后端服务的示例。 51 | 52 | ~~~bash 53 | python api.py atom7b 54 | ~~~ 55 | 56 | 接着可以使用如下代码进行直接访问 57 | 58 | ~~~python 59 | post_data = json.dumps({'prompt': 'Hello, solve 5x=13'}) 60 | print(json.loads(requests.post("http://0.0.0.0:8000", post_data).text)['response']) 61 | ~~~ 62 | 63 | ## 配置要求低 64 | 65 | 针对大模型显存消耗大等痛点,Jittor团队研发了动态交换技术,Jittor框架是世界上首个支持动态图变量自动交换功能的框架,区别于以往的基于静态图交换技术,用户不需要修改任何代码,原生的动态图代码即可直接支持张量交换,张量数据可以在显存-内存-硬盘之间自动交换,降低用户开发难度。 66 | 67 | 同时,Jittor大模型推理库也是目前对配置门槛要求最低的框架,只需要参数磁盘空间和2G内存,无需显卡,也可以部署大模型,下面是在不同硬件配置条件下的资源消耗与速度对比。可以发现,JittorLLMs在显存充足的情况下,性能优于同类框架,而显存不足甚至没有显卡,JittorLLMs都能以一定速度运行。 68 | 69 | 节省内存方法,请安装Jittor版本大于1.3.7.8,并添加如下环境变量: 70 | ```bash 71 | export JT_SAVE_MEM=1 72 | # 限制cpu最多使用16G 73 | export cpu_mem_limit=16000000000 74 | # 限制device内存(如gpu、tpu等)最多使用8G 75 | export device_mem_limit=8000000000 76 | # windows 用户,请使用powershell 77 | # $env:JT_SAVE_MEM="1" 78 | # $env:cpu_mem_limit="16000000000" 79 | # $env:device_mem_limit="8000000000" 80 | ``` 81 | 用户可以自由设定cpu和设备内存的使用量,如果不希望对内存进行限制,可以设置为`-1`。 82 | ```bash 83 | # 限制cpu最多使用16G 84 | export cpu_mem_limit=-1 85 | # 限制device内存(如gpu、tpu等)最多使用8G 86 | export device_mem_limit=-1 87 | # windows 用户,请使用powershell 88 | # $env:JT_SAVE_MEM="1" 89 | # $env:cpu_mem_limit="-1" 90 | # $env:device_mem_limit="-1" 91 | ``` 92 | 93 | 如果想要清理磁盘交换文件,可以运行如下命令 
94 | ```bash 95 | python -m jittor_utils.clean_cache swap 96 | ``` 97 | -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/README.md: -------------------------------------------------------------------------------- 1 | # 使用NVIDIA TensorRT-LLM部署LLama2 或者Atom 2 | 3 | [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main)是NVIDIA开发的高性能推理框架,您可以按照以下步骤来使用TensorRT-LLM部署LLama2模型或者Atom模型。 4 | 5 | 以下部署流程参考[TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama),需要机器Nvidia显卡驱动535版本以上 6 | 7 | ## Support Matrix 8 | * FP16 9 | * FP8 10 | * INT8 & INT4 Weight-Only 11 | * SmoothQuant 12 | * Groupwise quantization (AWQ/GPTQ) 13 | * FP8 KV CACHE 14 | * INT8 KV CACHE (+ AWQ/per-channel weight-only) 15 | * Tensor Parallel 16 | * STRONGLY TYPED 17 | 18 | ## 1. 安装TensorRT-LLM 19 | #### 获取TensorRT-LLM代码: 20 | 21 | ```bash 22 | # TensorRT-LLM 代码需要使用 git-lfs 拉取 23 | apt-get update && apt-get -y install git git-lfs 24 | 25 | git clone https://github.com/NVIDIA/TensorRT-LLM.git 26 | cd TensorRT-LLM 27 | 28 | # 本流程将使用 v0.7.0 Release 版本 29 | git checkout tags/v0.7.0 -b release/0.7.0 30 | git submodule update --init --recursive 31 | git lfs install 32 | git lfs pull 33 | ``` 34 | #### 构建docker镜像并安装TensorRT-LLM 35 | ```bash 36 | make -C docker release_build 37 | ``` 38 | 39 | #### 运行docker镜像: 40 | ```bash 41 | make -C docker release_run 42 | ``` 43 | 44 | ## 2. 为LLama2模型构建TensorRT-LLM推理引擎: 45 | 46 | #### 进入build文件夹: 47 | ```bash 48 | cd ./examples/llama 49 | ``` 50 | 51 | #### 从Huggingface下载Atom或者LLama2模型: 52 | ``` 53 | # 您可以选择具体想部署的模型下载 54 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 55 | mv Atom-7B-Chat /origin_model 56 | ``` 57 | 58 | #### 使用build.py 构建推理引擎: 59 | 以下是一个常见事例,更多参数参考[TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) 60 | ```bash 61 | python build.py --max_batch_size 1 --max_num_tokens 8192 --model_dir /origin_model --dtype float16 --remove_input_padding --use_inflight_batching --paged_kv_cache --use_weight_only --enable_context_fmha --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --output_dir /model/tensorrt_llm/1 --world_size 1 --tp_size 1 --pp_size 1 --max_input_len 7168 --max_output_len 1024 --multi_block_mode --rotary_scaling dynamic 8.0 --rotary_base 500000 62 | ``` 63 | 64 | ## 3. 
使用TensorRT-LLM Python Runtime进行推理 65 | 66 | #### 使用我们提供的python代码类,启动单机单卡服务 67 | ```bash 68 | python atom_inference.py \ 69 | /model/tensorrt_llm/1 \ # 第一个参数 build.py 的output路径 70 | /origin_model \ # 第二个参数模型tokenizer的路径 71 | 如何成为一个更加优秀的人 # 希望问的问题 72 | ``` -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/atom_inference.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, 8 | load_tokenizer, read_model_name, throttle_generator) 9 | 10 | import tensorrt_llm 11 | from tensorrt_llm.logger import logger 12 | from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner 13 | 14 | if PYTHON_BINDINGS: 15 | from tensorrt_llm.runtime import ModelRunnerCpp 16 | 17 | class AtomTRTApi: 18 | def __init__(self,engine_dir,tokenizer_dir,max_input_length=4096): 19 | self.runtime_rank = tensorrt_llm.mpi_rank() 20 | self.model_name = read_model_name(engine_dir) 21 | 22 | self.tokenizer, self.pad_id, self.end_id = load_tokenizer( 23 | tokenizer_dir=tokenizer_dir, 24 | tokenizer_type='llama', 25 | ) 26 | self.use_py_session=False 27 | if not PYTHON_BINDINGS: 28 | logger.warning( 29 | "Python bindings of C++ session is unavailable, fallback to Python session." 30 | ) 31 | self.use_py_session = True 32 | runner_cls = ModelRunner if self.use_py_session else ModelRunnerCpp 33 | runner_kwargs = dict(engine_dir=engine_dir, 34 | lora_dir=None, 35 | rank=self.runtime_rank, 36 | debug_mode=False, 37 | lora_ckpt_source='hf') 38 | 39 | if not self.use_py_session: 40 | runner_kwargs.update( 41 | max_batch_size=1, 42 | max_input_len=max_input_length, 43 | max_output_len=2048, 44 | max_beam_width=1, 45 | max_attention_window_size=None) 46 | self.runner = runner_cls.from_dir(**runner_kwargs) 47 | 48 | 49 | def ask(self,input_text,temperature=0.4,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',merge_lambda=None,max_input_length=4096,append_next_role=True): 50 | with torch.no_grad(): 51 | prompt = '' 52 | print('max_input_length',max_input_length) 53 | if type(input_text)==list: 54 | for input_text_one in input_text[::-1]: 55 | if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n")" + prompt 57 | if append_next_role: 58 | if input_text[-1]['role']=='Human': 59 | prompt += "Assistant:" 60 | else: 61 | prompt += "Human:" 62 | else: 63 | if merge_lambda is None: 64 | if append_next_role: 65 | prompt += "Human: "+input_text.strip()+"\nAssistant:" 66 | else: 67 | prompt += "Human: "+input_text.strip()+"\n" 68 | else: 69 | prompt += merge_lambda(input_text) 70 | if len(system_prefix)>0: 71 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt 72 | print('输入模型的完整输入:',prompt) 73 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids] 74 | print(input_ids) 75 | input_ids = [ 76 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids 77 | ] 78 | print('输入模型的token数量',input_ids[0].shape) 79 | generate_input = { 80 | "batch_input_ids":input_ids, 81 | "max_new_tokens":max_new_tokens, 82 | "max_attention_window_size":None, 83 | "do_sample":True, 84 | "top_k":50, 85 | "top_p":top_p, 86 | "num_beams":1, 87 | "length_penalty":1.0, 88 | "stop_words_list":None, 89 | "bad_words_list":None, 90 | "streaming":False, 91 | "temperature":temperature, 92 | "output_sequence_lengths":True, 
93 | "return_dict":False, 94 | "repetition_penalty":repetition_penalty, 95 | "end_id":self.tokenizer.eos_token_id, 96 | "bos_token_id":self.tokenizer.bos_token_id, 97 | "pad_id":self.tokenizer.pad_token_id 98 | } 99 | generate_ids = self.runner.generate(**generate_input) 100 | torch.cuda.synchronize() 101 | print(generate_ids) 102 | generate_ids = generate_ids.cpu().tolist() 103 | generate_ids = [item[0][len(input_ids[0][0]):] for item in generate_ids] 104 | try: 105 | generate_ids = [item[:item.index(self.tokenizer.eos_token_id)] for item in generate_ids ] 106 | except: 107 | pass 108 | print(generate_ids) 109 | # output = ''.join(tokenizer.convert_ids_to_tokens(generate_ids[0])) 110 | # print('生成的token长度',len(generate_ids[0])) 111 | bot_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 112 | if 'Human:' in bot_message: 113 | bot_message = bot_message.split('Human:')[0] 114 | print(bot_message) 115 | return bot_message.strip() 116 | 117 | def ask_streaming(self,input_text,temperature=0.8,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',max_input_length=4096,append_next_role=True): 118 | with torch.no_grad(): 119 | prompt = '' 120 | print('max_input_length',max_input_length) 121 | if type(input_text)==list: 122 | for input_text_one in input_text[::-1]: 123 | if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n")" + prompt 125 | if append_next_role: 126 | if input_text[-1]['role']=='Human': 127 | prompt += "Assistant:" 128 | else: 129 | prompt += "Human:" 130 | else: 131 | if append_next_role: 132 | prompt += "Human: "+input_text.strip()+"\nAssistant:" 133 | else: 134 | prompt += "Human: "+input_text.strip()+"\n" 135 | if len(system_prefix)>0: 136 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt 137 | print('输入模型的完整输入:',prompt) 138 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids] 139 | print(input_ids) 140 | input_ids = [ 141 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids 142 | ] 143 | print('输入模型的token数量',input_ids[0].shape) 144 | generate_input = { 145 | "batch_input_ids":input_ids, 146 | "max_new_tokens":max_new_tokens, 147 | "max_attention_window_size":None, 148 | "do_sample":True, 149 | "top_k":50, 150 | "top_p":top_p, 151 | "num_beams":1, 152 | "length_penalty":1.0, 153 | "stop_words_list":None, 154 | "bad_words_list":None, 155 | "streaming":True, 156 | "temperature":temperature, 157 | "output_sequence_lengths":True, 158 | "return_dict":True, 159 | "repetition_penalty":repetition_penalty, 160 | "end_id":self.tokenizer.eos_token_id, 161 | "bos_token_id":self.tokenizer.bos_token_id, 162 | "pad_id":self.tokenizer.pad_token_id 163 | } 164 | generate_ids = self.runner.generate(**generate_input) 165 | torch.cuda.synchronize() 166 | 167 | input_token_num = len(input_ids[0][0]) 168 | answer_message ='' 169 | for curr_outputs in throttle_generator(generate_ids,2): 170 | output_ids = curr_outputs['output_ids'] 171 | sequence_lengths = curr_outputs['sequence_lengths'] 172 | # print(sequence_lengths) 173 | output_ids = output_ids.cpu().tolist() 174 | output_ids = [item[0][input_token_num:sequence_lengths[0][0]] for item in output_ids] 175 | answer_message = self.tokenizer.batch_decode(output_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 176 | if 'Human:' in answer_message: 177 | answer_message = answer_message.split('Human:')[0] 178 | yield answer_message.strip() 179 | return 
answer_message.strip() 180 | 181 | 182 | if __name__=='__main__': 183 | model = AtomTRTApi(engine_dir=sys.argv[1],tokenizer_dir=sys.argv[2]) 184 | model.ask('如何成为一个更优秀的人') 185 | -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | from pathlib import Path 18 | from typing import Optional 19 | from typing import Union 20 | 21 | from transformers import AutoTokenizer, T5Tokenizer 22 | 23 | import tensorrt_llm 24 | 25 | DEFAULT_HF_MODEL_DIRS = { 26 | 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', 27 | 'bloom': 'bigscience/bloom-560m', 28 | 'chatglm_6b': 'THUDM/chatglm-6b', 29 | 'chatglm2_6b': 'THUDM/chatglm2-6b', 30 | 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', 31 | 'chatglm3_6b': 'THUDM/chatglm3-6b', 32 | 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', 33 | 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', 34 | 'falcon': 'tiiuae/falcon-rw-1b', 35 | 'glm_10b': 'THUDM/glm-10b', 36 | 'gpt': 'gpt2-medium', 37 | 'gptj': 'EleutherAI/gpt-j-6b', 38 | 'gptneox': 'EleutherAI/gpt-neox-20b', 39 | 'internlm': 'internlm/internlm-chat-7b', 40 | 'llama': 'meta-llama/Llama-2-7b-hf', 41 | 'mpt': 'mosaicml/mpt-7b', 42 | 'phi': 'microsoft/phi-2', 43 | 'opt': 'facebook/opt-350m', 44 | 'qwen': 'Qwen/Qwen-7B', 45 | } 46 | 47 | DEFAULT_PROMPT_TEMPLATES = { 48 | 'internlm': 49 | "<|User|>:{input_text}\n<|Bot|>:", 50 | 'qwen': 51 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", 52 | } 53 | 54 | def get_engine_version(engine_dir: str) -> Union[None, str]: 55 | engine_dir = Path(engine_dir) 56 | config_path = engine_dir / "config.json" 57 | with open(config_path, 'r') as f: 58 | config = json.load(f) 59 | 60 | if 'version' not in config: 61 | return None 62 | 63 | return config['version'] 64 | 65 | def read_model_name(engine_dir: str): 66 | engine_version = get_engine_version(engine_dir) 67 | 68 | with open(Path(engine_dir) / "config.json", 'r') as f: 69 | config = json.load(f) 70 | 71 | if engine_version is None: 72 | return config['builder_config']['name'] 73 | 74 | return config['pretrained_config']['architecture'] 75 | 76 | 77 | def throttle_generator(generator, stream_interval): 78 | for i, out in enumerate(generator): 79 | if not i % stream_interval: 80 | yield out 81 | 82 | if i % stream_interval: 83 | yield out 84 | 85 | 86 | def load_tokenizer(tokenizer_dir: Optional[str] = None, 87 | vocab_file: Optional[str] = None, 88 | model_name: str = 'gpt', 89 | tokenizer_type: Optional[str] = None): 90 | if vocab_file is None: 91 | use_fast = True 92 | if tokenizer_type is not None and tokenizer_type == "llama": 93 | use_fast = 
False 94 | # Should set both padding_side and truncation_side to be 'left' 95 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, 96 | legacy=False, 97 | padding_side='left', 98 | truncation_side='left', 99 | trust_remote_code=True, 100 | tokenizer_type=tokenizer_type, 101 | use_fast=use_fast) 102 | else: 103 | # For gpt-next, directly load from tokenizer.model 104 | assert model_name == 'gpt' 105 | tokenizer = T5Tokenizer(vocab_file=vocab_file, 106 | padding_side='left', 107 | truncation_side='left') 108 | 109 | if model_name == 'qwen': 110 | with open(Path(tokenizer_dir) / "generation_config.json") as f: 111 | gen_config = json.load(f) 112 | chat_format = gen_config['chat_format'] 113 | if chat_format == 'raw': 114 | pad_id = gen_config['pad_token_id'] 115 | end_id = gen_config['eos_token_id'] 116 | elif chat_format == 'chatml': 117 | pad_id = tokenizer.im_end_id 118 | end_id = tokenizer.im_end_id 119 | else: 120 | raise Exception(f"unknown chat format: {chat_format}") 121 | elif model_name == 'glm_10b': 122 | pad_id = tokenizer.pad_token_id 123 | end_id = tokenizer.eop_token_id 124 | else: 125 | if tokenizer.pad_token_id is None: 126 | tokenizer.pad_token_id = tokenizer.eos_token_id 127 | pad_id = tokenizer.pad_token_id 128 | end_id = tokenizer.eos_token_id 129 | 130 | return tokenizer, pad_id, end_id 131 | -------------------------------------------------------------------------------- /inference-speed/GPU/lmdeploy_example/README.md: -------------------------------------------------------------------------------- 1 | # lmdeploy 安装和使用 2 | 3 | lmdeploy 支持 transformer 结构(例如 Atom、LLaMA、LLaMa2、InternLM、Vicuna 等),目前支持 fp16,int8 和 int4。 4 | 5 | ## 一、安装 6 | 7 | 安装预编译的 python 包 8 | ``` 9 | python3 -m pip install lmdeploy==0.2.1 10 | ``` 11 | 12 | ## 二、转换huggingface模型为lmdeploy格式 13 | 14 | 把模型转成 lmdeploy 推理格式,假设 huggingface 版 [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) 模型已下载到 `/models/Atom-7B-Chat` 目录,结果会存到 当前执行命令的`workspace` 文件夹 15 | 16 | ```shell 17 | lmdeploy convert llama2 /models/Atom-7B-Chat 18 | ``` 19 | lmdeploy 修改一处bug 20 | ``` 21 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/preprocessing/1/tokenizer/tokenizer.py 22 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/postprocessing/1/tokenizer/tokenizer.py 23 | ``` 24 | 25 | 26 | ## 三、kv cache int8 量化 27 | 对于最大长度是 2048 的 Atom-7B fp16 模型,服务端每创建 1 个并发,都需要大约 1030MB 显存保存 kv_cache,即便是 A100 80G,能服务的用户也非常有限。 28 | 为了降低运行时显存,lmdeploy 实现了 kv cache PTQ 量化,同样的显存可以服务更多并发用户。 29 | 首先计算模型参数,保存到临时目录 atom 30 | ```shell 31 | mkdir atom 32 | lmdeploy lite calibrate \ 33 | /models/Atom-7B-Chat \ # huggingface Atom 模型。也支持 llama/vicuna/internlm/baichuan 等 34 | --calib-dataset 'ptb' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval 35 | --calib-samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 36 | --device 'cuda' \ # 单条的文本长度,如果显存不够,可以适当调小 37 | --work-dir atom # 保存 pth 格式量化统计参数和量化后权重的文件夹 38 | ``` 39 | 注意:可能需要安装flash_attn 40 | ```shell 41 | conda install -c nvidia cuda-nvcc # 为了使用conda内的cuda环境安装 flash_attn 42 | pip install flash_attn 43 | ``` 44 | 45 | 46 | 然后用 atom 目录里的参数,计算量化参数,保存到转换后参数到 `workspace/triton_models/weights` 下 47 | 48 | ```shell 49 | lmdeploy lite kv_qparams \ 50 | ./atom \ # 上一步计算的 atom 结果 51 | ./workspace/triton_models/weights \ # 结果保存目录 52 | --num-tp 1 # tensor parallel GPU 个数 53 | ``` 54 | 55 | 修改推理配置,开启 kv cache int8。编辑 `workspace/triton_models/weights/config.ini` 56 | * 把 
`use_context_fmha` 改为 0,表示关闭 flashattention 57 | * 把 `quant_policy` 设为 4,表示打开 kv cache 量化 58 | 59 | 最终执行测试即可 60 | ```shell 61 | lmdeploy chat turbomind ./workspace 62 | ``` 63 | 64 | [点击这里](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/kv_int8.md) 查看 kv cache int8 量化实现公式、精度和显存测试报告。 65 | 66 | ## 四、weight int4 量化 67 | 68 | lmdeploy 基于 [AWQ 算法](https://arxiv.org/abs/2306.00978) 实现了 weight int4 量化,性能是 FP16 的 2.4 倍以上。显存从 16G 降低到 6.3G。 69 | 70 | 对于自己的模型,可以用`auto_awq`工具来优化 71 | ```shell 72 | # 指定量化导出的模型路径 73 | WORK_DIR="./atom-7b-chta-w4" 74 | 75 | lmdeploy lite auto_awq \ 76 | $HF_MODEL \ # huggingface 模型位置 77 | --calib-dataset 'ptb' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval 78 | --calib-samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 79 | --calib-seqlen 2048 \ # 单条的文本长度,如果显存不够,可以适当调小 80 | --w-bits 4 \ # 权重量化的 bit 数 81 | --w-group-size 128 \ # 权重量化分组统计尺寸 82 | --work-dir $WORK_DIR 83 | ``` 84 | 85 | 执行以下命令,启动服务: 86 | ```shell 87 | # 这里的路径是上面步骤一中转换模型的layout的输出 88 | FasterTransformer_PATH="/path/workspace" 89 | 90 | TP=1 91 | # 指定需要用的显卡 92 | DEVICES="0" 93 | for ((i = 1; i < ${TP}; ++i)); do 94 | DEVICES="${DEVICES},$i" 95 | done 96 | DEVICES="\"device=${DEVICES}\"" 97 | 98 | # 在容器内启动服务 99 | docker run -idt \ 100 | --gpus $DEVICES \ 101 | -v $FasterTransformer_PATH:/workspace/models \ 102 | --shm-size 16g \ 103 | -p 33336:22 \ 104 | -p 33337-33400:33337-33400 \ 105 | --cap-add=SYS_PTRACE \ 106 | --cap-add=SYS_ADMIN \ 107 | --security-opt seccomp=unconfined \ 108 | --name lmdeploy \ 109 | --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \ 110 | tritonserver \ 111 | --model-repository=/workspace/models/model_repository \ 112 | --allow-http=0 \ 113 | --allow-grpc=1 \ 114 | --grpc-port=33337 \ 115 | --log-verbose=0 \ 116 | --allow-metrics=1 117 | ``` 118 | 119 | 客户端测试: 120 | ```shell 121 | python test_api_server.py --tritonserver_addr 127.0.0.1:33337 122 | ``` 123 | 124 | [点击这里](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md) 查看 weight int4 量化的显存和速度测试结果。 125 | 126 | 额外说明,weight int4 和 kv cache int8 二者并不冲突、可以同时打开,节约更多显存。 127 | -------------------------------------------------------------------------------- /inference-speed/GPU/lmdeploy_example/test_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from lmdeploy.serve.turbomind.chatbot import Chatbot 4 | 5 | def input_prompt(chat_history, system_prompt: str): 6 | """Input a prompt in the consolo interface.""" 7 | prompt = '' 8 | for input_text_one in chat_history: 9 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 10 | if chat_history[-1]['role']=='Human': 11 | prompt += "Assistant: " 12 | else: 13 | prompt += "Human: " 14 | prompt = prompt[-2048:] 15 | if len(system_prompt)>0: 16 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 17 | 18 | return prompt 19 | 20 | def main(tritonserver_addr: str, 21 | session_id: int = 1, 22 | cap: str = 'chat', 23 | stream_output: bool = True, 24 | **kwargs): 25 | """An example to communicate with inference server through the command line 26 | interface. 27 | 28 | Args: 29 | tritonserver_addr (str): the address in format "ip:port" of 30 | triton inference server 31 | session_id (int): the identical id of a session 32 | cap (str): the capability of a model. 
For example, codellama has 33 | the ability among ['completion', 'infill', 'instruct', 'python'] 34 | stream_output (bool): indicator for streaming output or not 35 | **kwargs (dict): other arguments for initializing model's chat template 36 | """ 37 | log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING') 38 | kwargs.update(capability=cap) 39 | chatbot = Chatbot(tritonserver_addr, 40 | log_level=log_level, 41 | display=stream_output, 42 | **kwargs) 43 | nth_round = 1 44 | prompt = input_prompt([{"role": "Human", "content" : "心情不好怎么办"}], "") 45 | 46 | request_id = f'{session_id}-{nth_round}' 47 | begin = time.time() 48 | if stream_output: 49 | for status, res, n_token in chatbot.stream_infer( 50 | session_id, 51 | prompt, 52 | request_id=request_id, 53 | request_output_len=512): 54 | # print("n_token:", n_token) 55 | continue 56 | 57 | else: 58 | status, res, n_token = chatbot.infer(session_id, 59 | prompt, 60 | request_id=request_id, 61 | request_output_len=512) 62 | print(res) 63 | # print("n_token:", n_token) 64 | nth_round += 1 65 | end = time.time() 66 | speed = n_token/(end-begin) 67 | print("speed {} tokens/s".format(speed)) 68 | 69 | 70 | if __name__ == '__main__': 71 | import fire 72 | 73 | fire.Fire(main) 74 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/README.md: -------------------------------------------------------------------------------- 1 | # vllm推理部署 2 | 3 | [vllm](https://github.com/vllm-project/vllm)同样是GPU推理的方案。相比较与FasterTrainsformer,vllm更加的简单易用。不需要额外进行模型的转换。支持fp16推理。 4 | 5 | 特点: 6 | 7 | + 快速的推理速度 8 | + 高效的kv cache 9 | + 连续的batch请求推理 10 | + 优化cuda算子 11 | + 支持分布式推理 12 | 13 | ## 第一步: 安装vllm 14 | 15 | ```bash 16 | pip install vllm 17 | ``` 18 | 19 | ## 第二步:启动测试server 20 | 21 | 从Huggingface下载Atom或者LLama3模型: 22 | ``` 23 | # 您可以选择具体想部署的模型下载 24 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 25 | 26 | # 或者下载Meta官方的Llama3模型: 27 | git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct Meta-Llama-3-8B-Instruct 28 | ``` 29 | 30 | 1. 单卡推理 31 | 32 | 编辑single_gpus_api_server.sh里面model为上面模型的下载路径。 33 | 34 | 启动测试server 35 | ```bash 36 | # multi_gpus_api_server.sh 里面的CUDA_VISIBLE_DEVICES指定了要使用的GPU卡 37 | bash single_gpus_api_server.sh 38 | ``` 39 | 40 | 2. 多卡推理 41 | 42 | 13B模型,70B模型推荐多卡推理。编辑multi_gpus_api_server.sh里面model为上面的13B模型的下载路径。 43 | 44 | 启动测试server 45 | ```bash 46 | # multi_gpus_api_server.sh 里面的CUDA_VISIBLE_DEVICES指定了要使用的GPU卡 47 | # tensor-parallel-size 指定了卡的个数 48 | bash multi_gpus_api_server.sh 49 | ``` 50 | 51 | ## 第三步:启动client测试 52 | 53 | 注意下面的model_source 模型的源,可以是 llama_chinese、llama2_meta、llama3_meta 根据下载的模型不同去区分,如果下载的是[FlagAlpha](https://huggingface.co/FlagAlpha)下载的则用llama_chinese。 54 | 55 | ``` 56 | python client_test.py --model_source llama_chinese 57 | ``` 58 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import BackgroundTasks, FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 
15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | 18 | 19 | @app.post("/generate") 20 | async def generate(request: Request) -> Response: 21 | """Generate completion for the request. 22 | 23 | The request should be a JSON object with the following fields: 24 | - prompt: the prompt to use for the generation. 25 | - stream: whether to stream the results or not. 26 | - other fields: the sampling parameters (See `SamplingParams` for details). 27 | """ 28 | request_dict = await request.json() 29 | prompt = request_dict.pop("prompt") 30 | stream = request_dict.pop("stream", False) 31 | sampling_params = SamplingParams(**request_dict) 32 | request_id = random_uuid() 33 | results_generator = engine.generate(prompt, sampling_params, request_id) 34 | 35 | # Streaming case 36 | async def stream_results() -> AsyncGenerator[bytes, None]: 37 | async for request_output in results_generator: 38 | prompt = request_output.prompt 39 | text_outputs = [ 40 | prompt + output.text for output in request_output.outputs 41 | ] 42 | ret = {"text": text_outputs} 43 | yield (json.dumps(ret) + "\0").encode("utf-8") 44 | 45 | async def abort_request() -> None: 46 | await engine.abort(request_id) 47 | 48 | if stream: 49 | background_tasks = BackgroundTasks() 50 | # Abort the request if the client disconnects. 51 | background_tasks.add_task(abort_request) 52 | return StreamingResponse(stream_results(), background=background_tasks) 53 | 54 | # Non-streaming case 55 | final_output = None 56 | async for request_output in results_generator: 57 | if await request.is_disconnected(): 58 | # Abort the request if the client disconnects. 59 | await engine.abort(request_id) 60 | return Response(status_code=499) 61 | final_output = request_output 62 | 63 | assert final_output is not None 64 | prompt = final_output.prompt 65 | text_outputs = [prompt + output.text for output in final_output.outputs] 66 | ret = {"text": text_outputs} 67 | return JSONResponse(ret) 68 | 69 | 70 | if __name__ == "__main__": 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--host", type=str, default="0.0.0.0") 73 | parser.add_argument("--port", type=int, default=8090) 74 | parser.add_argument("--trust_remote_code", type=bool, default=True) 75 | parser = AsyncEngineArgs.add_cli_args(parser) 76 | args = parser.parse_args() 77 | 78 | engine_args = AsyncEngineArgs.from_cli_args(args) 79 | engine = AsyncLLMEngine.from_engine_args(engine_args) 80 | 81 | uvicorn.run(app, 82 | host=args.host, 83 | port=args.port, 84 | log_level="debug", 85 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 86 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/client_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import time 4 | import argparse 5 | 6 | import urllib.request 7 | 8 | import sys 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--model_source', default="llama_chinese", choices =["llama_chinese", "llama2_meta", "llama3_meta"], required=False,type=str) 12 | args = parser.parse_args() 13 | 14 | def get_prompt_llama_chinese( 15 | chat_history, system_prompt="" 16 | ) -> str: 17 | prompt = '' 18 | for input_text_one in chat_history: 19 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 20 | if chat_history[-1]['role']=='Human': 21 | prompt += "Assistant: " 22 | else: 23 | prompt += "Human: " 24 | prompt = prompt[-2048:] 25 | if 
len(system_prompt)>0: 26 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 27 | 28 | return prompt 29 | 30 | def get_prompt_llama2_meta(chat_history, system_prompt=""): 31 | B_INST, E_INST = "[INST]", "[/INST]" 32 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 33 | 34 | sep = " " 35 | sep2 =" " 36 | stop_token_ids = [2] 37 | system_template = f"[INST] <>\n{system_prompt}\n<>\n\n" 38 | roles = ("[INST]", "[/INST]") 39 | seps = [sep, sep2] 40 | if system_prompt.strip() != "": 41 | ret = system_template 42 | else: 43 | ret = "[INST] " 44 | for i, chat in enumerate(chat_history): 45 | message = chat["content"] 46 | role = chat["role"] 47 | if message: 48 | if i == 0: 49 | ret += message + " " 50 | else: 51 | if role == "Human": 52 | ret += "[INST]" + " " + message + seps[i % 2] 53 | else: 54 | ret += "[/INST]" + " " + message + seps[i % 2] 55 | else: 56 | if role == "Human": 57 | ret += "[INST]" 58 | else: 59 | ret += "[/INST]" 60 | print("prompt:{}".format(ret)) 61 | return ret 62 | 63 | def get_prompt_llama3_meta(chat_history, system_prompt=""): 64 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>' 65 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>' 66 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n' 67 | prompt_str = '' 68 | # 拼接历史对话 69 | for item in chat_history: 70 | if item['role']=='Human': 71 | prompt_str+=user_format.format(content=item['content']) 72 | else: 73 | prompt_str+=assistant_format.format(content=item['content']) 74 | if len(system_prompt)>0: 75 | prompt_str = system_format.format(content=system_prompt) + prompt_str 76 | prompt_str = "<|begin_of_text|>" + prompt_str 77 | return prompt_str 78 | 79 | 80 | def test_api_server(chat_history=[], system_prompt=""): 81 | header = {'Content-Type': 'application/json'} 82 | 83 | if args.model_source == "llama2_meta": 84 | prompt = get_prompt_llama2_meta(chat_history, system_prompt) 85 | elif args.model_source == "llama3_meta": 86 | prompt = get_prompt_llama3_meta(chat_history, system_prompt) 87 | else: 88 | prompt = get_prompt_llama_chinese(chat_history, system_prompt) 89 | 90 | data = { 91 | "prompt": prompt, 92 | "stream" : False, 93 | "n" : 1, 94 | "best_of": 1, 95 | "presence_penalty": 0.0, 96 | "frequency_penalty": 0.2, 97 | "temperature": 0.3, 98 | "top_p" : 0.95, 99 | "top_k": 50, 100 | "use_beam_search": False, 101 | "stop": [], 102 | "ignore_eos" :False, 103 | "max_tokens": 2048, 104 | "logprobs": None 105 | } 106 | request = urllib.request.Request( 107 | url='http://127.0.0.1:8090/generate', 108 | headers=header, 109 | data=json.dumps(data).encode('utf-8') 110 | ) 111 | 112 | result = None 113 | try: 114 | response = urllib.request.urlopen(request, timeout=300) 115 | res = response.read().decode('utf-8') 116 | result = json.loads(res) 117 | print(json.dumps(data, ensure_ascii=False, indent=2)) 118 | print(json.dumps(result, ensure_ascii=False, indent=2)) 119 | 120 | except Exception as e: 121 | print(e) 122 | 123 | return result 124 | 125 | if __name__ == "__main__": 126 | # 多伦对话测试 127 | """ 多伦对话测试 128 | last_question = "怎么回来呢" 129 | inputs = [{"role": "Human", "content": "如何去北京"}, 130 | {"role": "Assitant", "content": "乘坐飞机或者轮船"}, 131 | {"role" : "Human", "content": last_question}] 132 | """ 133 | # 单轮对话 134 | last_question = "怎么去北京" 135 | chat_history = [ {"role" : "Human", "content": last_question}] 136 | test_api_server(chat_history) 137 | 138 | 
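# --- Added usage note (not part of the original client_test.py) ---------------
# api_server.py above also accepts "stream": true, in which case it yields JSON
# chunks terminated by a NUL byte ("\0") instead of one final JSON body (see its
# stream_results generator). Below is a minimal sketch of a streaming client; it
# reuses the json/urllib imports and the get_prompt_llama_chinese helper defined
# in this file, and the URL, sampling values and 1024-byte read size are
# illustrative assumptions rather than project defaults.
def test_api_server_stream(chat_history=[], system_prompt=""):
    prompt = get_prompt_llama_chinese(chat_history, system_prompt)
    data = {
        "prompt": prompt,
        "stream": True,        # ask the server to stream partial results
        "temperature": 0.3,
        "top_p": 0.95,
        "max_tokens": 2048,
    }
    request = urllib.request.Request(
        url='http://127.0.0.1:8090/generate',
        headers={'Content-Type': 'application/json'},
        data=json.dumps(data).encode('utf-8')
    )
    buffer = b''
    with urllib.request.urlopen(request, timeout=300) as response:
        while True:
            chunk = response.read(1024)
            if not chunk:
                break
            buffer += chunk
            # each complete message is terminated by "\0"; its "text" field holds
            # the prompt plus everything generated so far (cumulative output)
            while b'\0' in buffer:
                message, buffer = buffer.split(b'\0', 1)
                if message:
                    print(json.loads(message.decode('utf-8'))["text"])
# Example call (single-turn, mirroring the test above):
# test_api_server_stream([{"role": "Human", "content": "怎么去北京"}])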
-------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/multi_gpus_api_server.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 python api_server.py \ 2 | --model "./Atom-7B-Chat" \ 3 | --port 8090 \ 4 | --tensor-parallel-size 2 5 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/single_gpu_api_server.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python api_server.py \ 3 | --model "./Atom-7B-Chat" \ 4 | --port 8090 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | bitsandbytes==0.42.0 3 | accelerate==0.27.2 4 | numpy==1.26.4 5 | gekko==1.0.6 6 | pandas 7 | scipy 8 | sentencepiece==0.2.0 9 | datasets 10 | evaluate 11 | pytest 12 | peft==0.8.2 13 | transformers==4.39.0 14 | deepspeed==0.14.0 15 | scikit-learn 16 | torchvision 17 | torchdata 18 | torchaudio 19 | tensorboard 20 | gradio 21 | packaging -------------------------------------------------------------------------------- /scripts/api/README.md: -------------------------------------------------------------------------------- 1 | # API 调用 2 | 3 | ``` 4 | 您可以选择具体想部署的模型下载 5 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 6 | mv Atom-7B-Chat /path/origin_model 7 | ``` 8 | 9 | 首先需要安装额外的依赖 `pip install fastapi uvicorn`,然后运行仓库中的 [accelerate_server.py](accelerate_server.py): 10 | 11 | ```bash 12 | python accelerate_server.py \ 13 | --model_path /path/origin_model \ 14 | --gpus "0" \ 15 | --infer_dtype "int8" \ 16 | --model_source "llama2_chinese" 17 | ``` 18 | 参数说明: 19 | - model_path 模型的本地路径 20 | - gpus 使用的显卡编号,类似"0"、 "0,1" 21 | - infer_dtype 模型加载后的参数数据类型,可以是 int8, float16 22 | - model_source 模型的源,可以是llama2_chinese、llama2_meta、llama3_meta 根据下载的模型不同去区分,如果下载的是[FlagAlpha](https://huggingface.co/FlagAlpha)下载的则用llama2_chinese。 23 | 24 | 25 | 默认部署在本地的 8001 端口,通过 POST 方法进行调用 26 | 27 | ```bash 28 | python accelerate_client.py 29 | ``` 30 | -------------------------------------------------------------------------------- /scripts/api/accelerate_client.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import time 4 | import urllib.request 5 | import sys 6 | 7 | def test_api_server(input_text): 8 | header = {'Content-Type': 'application/json'} 9 | 10 | data = { 11 | "system_prompt": "", 12 | "history": inputs, 13 | "n" : 1, 14 | "best_of": 1, 15 | "presence_penalty": 1.2, 16 | "frequency_penalty": 0.2, 17 | "temperature": 0.3, 18 | "top_p" : 0.95, 19 | "top_k": 50, 20 | "use_beam_search": False, 21 | "stop": [], 22 | "ignore_eos" :False, 23 | "logprobs": None, 24 | "max_new_tokens": 2048, 25 | } 26 | request = urllib.request.Request( 27 | url='http://127.0.0.1:8001/generate', 28 | headers=header, 29 | data=json.dumps(data).encode('utf-8') 30 | ) 31 | 32 | result = None 33 | try: 34 | response = urllib.request.urlopen(request, timeout=300) 35 | res = response.read().decode('utf-8') 36 | result = json.loads(res) 37 | print(json.dumps(data, ensure_ascii=False, indent=2)) 38 | print(json.dumps(result, ensure_ascii=False, indent=2)) 39 | 40 | except Exception as e: 41 | print(e) 42 | 43 | return result 44 | 45 | if __name__ == "__main__": 46 | 47 | # 
多伦对话测试 48 | """ 多伦对话测试 49 | last_question = "怎么回来呢" 50 | inputs = [{"role": "Human", "content": "如何去北京"}, 51 | {"role": "Assitant", "content": "乘坐飞机或者轮船"}, 52 | {"role" : "Human", "content": last_question}] 53 | """ 54 | # 单轮对话 55 | last_question = "怎么去北京" 56 | inputs = [ {"role" : "Human", "content": last_question}] 57 | 58 | test_api_server(inputs) 59 | 60 | -------------------------------------------------------------------------------- /scripts/api/accelerate_server.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import argparse 3 | import gc 4 | import math 5 | import os 6 | import time 7 | 8 | from fastapi import FastAPI, Request 9 | from transformers import AutoTokenizer, AutoModel 10 | import uvicorn, json, datetime 11 | import torch 12 | import torch.distributed as dist 13 | 14 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_path',required=True,type=str) 18 | parser.add_argument('--gpus', default="0", type=str) 19 | parser.add_argument('--infer_dtype', default="int8", choices=["int4", "int8", "float16"], required=False,type=str) 20 | parser.add_argument('--model_source', default="llama2_chinese", choices =["llama2_chinese", "llama2_meta", "llama3_meta"], required=False,type=str) 21 | 22 | args = parser.parse_args() 23 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 24 | 25 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 26 | world_size = torch.cuda.device_count() 27 | 28 | rank = local_rank 29 | 30 | app = FastAPI() 31 | 32 | def get_prompt_llama2chinese( 33 | chat_history, system_prompt="" 34 | ) -> str: 35 | prompt = '' 36 | for input_text_one in chat_history: 37 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 38 | if chat_history[-1]['role']=='Human': 39 | prompt += "Assistant: " 40 | else: 41 | prompt += "Human: " 42 | prompt = prompt[-2048:] 43 | if len(system_prompt)>0: 44 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 45 | 46 | return prompt 47 | 48 | def get_prompt(chat_history, system_prompt=""): 49 | B_INST, E_INST = "[INST]", "[/INST]" 50 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 51 | 52 | sep = " " 53 | sep2 =" " 54 | stop_token_ids = [2] 55 | system_template = f"[INST] <>\n{system_prompt}\n<>\n\n" 56 | roles = ("[INST]", "[/INST]") 57 | seps = [sep, sep2] 58 | if system_prompt.strip() != "": 59 | ret = system_template 60 | else: 61 | ret = "[INST] " 62 | for i, chat in enumerate(chat_history): 63 | message = chat["content"] 64 | role = chat["role"] 65 | if message: 66 | if i == 0: 67 | ret += message + " " 68 | else: 69 | if role == "Human": 70 | ret += "[INST]" + " " + message + seps[i % 2] 71 | else: 72 | ret += "[/INST]" + " " + message + seps[i % 2] 73 | else: 74 | if role == "Human": 75 | ret += "[INST]" 76 | else: 77 | ret += "[/INST]" 78 | print("prompt:{}".format(ret)) 79 | return ret 80 | 81 | def get_prompt_llama3(chat_history, system_prompt=""): 82 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>' 83 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>' 84 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n' 85 | prompt_str = '' 86 | # 拼接历史对话 87 | for item in chat_history: 88 | if item['role']=='Human': 89 | prompt_str+=user_format.format(content=item['content']) 90 | else: 91 | prompt_str+=assistant_format.format(content=item['content']) 92 | if 
len(system_prompt)>0: 93 | prompt_str = system_format.format(content=system_prompt) + prompt_str 94 | prompt_str = "<|begin_of_text|>" + prompt_str 95 | return prompt_str 96 | 97 | 98 | @app.post("/generate") 99 | async def create_item(request: Request): 100 | global model, tokenizer 101 | json_post_raw = await request.json() 102 | json_post = json.dumps(json_post_raw) 103 | json_post_list = json.loads(json_post) 104 | history = json_post_list.get('history') 105 | system_prompt = json_post_list.get('system_prompt') 106 | max_new_tokens = json_post_list.get('max_new_tokens') 107 | top_p = json_post_list.get('top_p') 108 | temperature = json_post_list.get('temperature') 109 | 110 | if args.model_source == "llama2_meta": 111 | prompt = get_prompt(history, system_prompt) 112 | elif args.model_source == "llama3_meta": 113 | prompt = get_prompt_llama3(history, system_prompt) 114 | else: 115 | prompt = get_prompt_llama2chinese(history, system_prompt) 116 | 117 | inputs = tokenizer([prompt], return_tensors='pt').to("cuda") 118 | generate_kwargs = dict( 119 | inputs, 120 | # streamer=streamer, 121 | max_new_tokens=max_new_tokens, 122 | do_sample=True, 123 | top_p=top_p, 124 | top_k=50, 125 | temperature=temperature, 126 | num_beams=1, 127 | repetition_penalty=1.2, 128 | max_length=2048, 129 | ) 130 | generate_ids = model.generate(**generate_kwargs) 131 | 132 | generate_ids = [item[len(inputs[0]):-1] for item in generate_ids] 133 | 134 | bot_message = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 135 | if 'Human:' in bot_message: 136 | bot_message = bot_message.split('Human:')[0] 137 | 138 | now = datetime.datetime.now() 139 | time = now.strftime("%Y-%m-%d %H:%M:%S") 140 | answer = { 141 | "response": bot_message, 142 | "status": 200, 143 | "time": time 144 | } 145 | return answer 146 | 147 | def get_world_size() -> int: 148 | if dist.is_initialized(): 149 | return dist.get_world_size() 150 | else: 151 | return 1 152 | 153 | def print_rank0(*msg): 154 | if rank != 0: 155 | return 156 | print(*msg) 157 | 158 | 159 | if __name__ == '__main__': 160 | dtype = torch.float16 161 | kwargs = dict( 162 | device_map="auto", 163 | ) 164 | print("get_world_size:{}".format(get_world_size())) 165 | 166 | infer_dtype = args.infer_dtype 167 | if infer_dtype not in ["int4", "int8", "float16"]: 168 | raise ValueError("infer_dtype must one of int4, int8 or float16") 169 | 170 | if get_world_size() > 1: 171 | kwargs["device_map"] = "balanced_low_0" 172 | 173 | if infer_dtype == "int8": 174 | print_rank0("Using `load_in_8bit=True` to use quanitized model") 175 | kwargs["load_in_8bit"] = True 176 | else: 177 | kwargs["torch_dtype"] = dtype 178 | 179 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 180 | if infer_dtype in ["int8", "float16"]: 181 | model = AutoModelForCausalLM.from_pretrained(args.model_path, **kwargs,trust_remote_code=True,use_flash_attention_2=True) 182 | elif infer_dtype == "int4": 183 | from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model 184 | model = AutoGPTQForCausalLM.from_quantized( 185 | args.model_path, device="cuda:0", 186 | use_triton=False, 187 | low_cpu_mem_usage=True, 188 | # inject_fused_attention=False, 189 | # inject_fused_mlp=False 190 | ) 191 | 192 | model.eval() 193 | uvicorn.run(app, host='0.0.0.0', port=8001, workers=1) 194 | -------------------------------------------------------------------------------- /scripts/convert2hf/README.md: 
-------------------------------------------------------------------------------- 1 | ## Meta官网模型权重转换成Hugging Face格式 2 | 3 | 使用脚本 4 | ```bash 5 | python convert_llama_weights_to_hf.py \ 6 | --input_dir /path/to/downloaded/llama/weights \ 7 | --model_size 7B \ 8 | --output_dir /output/path 9 | ``` 10 | 11 | 通过脚本转换后的模型权重可以使用transformers进行加载,例如: 12 | 13 | ```py 14 | from transformers import LlamaForCausalLM, LlamaTokenizer 15 | 16 | model = LlamaForCausalLM.from_pretrained("/output/path") 17 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 18 | ``` -------------------------------------------------------------------------------- /scripts/convert2hf/convert_llama_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import gc 16 | import json 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | import torch 22 | 23 | from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer 24 | 25 | 26 | try: 27 | from transformers import LlamaTokenizerFast 28 | except ImportError as e: 29 | warnings.warn(e) 30 | warnings.warn( 31 | "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" 32 | ) 33 | LlamaTokenizerFast = None 34 | 35 | """ 36 | Sample usage: 37 | 38 | ``` 39 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \ 40 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path 41 | ``` 42 | 43 | Thereafter, models can be loaded via: 44 | 45 | ```py 46 | from transformers import LlamaForCausalLM, LlamaTokenizer 47 | 48 | model = LlamaForCausalLM.from_pretrained("/output/path") 49 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 50 | ``` 51 | 52 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions 53 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
54 | """ 55 | 56 | INTERMEDIATE_SIZE_MAP = { 57 | "7B": 11008, 58 | "13B": 13824, 59 | "30B": 17920, 60 | "65B": 22016, 61 | "70B": 28672, 62 | } 63 | NUM_SHARDS = { 64 | "7B": 1, 65 | "7Bf": 1, 66 | "13B": 2, 67 | "13Bf": 2, 68 | "30B": 4, 69 | "65B": 8, 70 | "70B": 8, 71 | "70Bf": 8, 72 | } 73 | 74 | 75 | def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): 76 | return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) 77 | 78 | 79 | def read_json(path): 80 | with open(path, "r") as f: 81 | return json.load(f) 82 | 83 | 84 | def write_json(text, path): 85 | with open(path, "w") as f: 86 | json.dump(text, f) 87 | 88 | 89 | def write_model(model_path, input_base_path, model_size, safe_serialization=True): 90 | os.makedirs(model_path, exist_ok=True) 91 | tmp_model_path = os.path.join(model_path, "tmp") 92 | os.makedirs(tmp_model_path, exist_ok=True) 93 | 94 | params = read_json(os.path.join(input_base_path, "params.json")) 95 | num_shards = NUM_SHARDS[model_size] 96 | n_layers = params["n_layers"] 97 | n_heads = params["n_heads"] 98 | n_heads_per_shard = n_heads // num_shards 99 | dim = params["dim"] 100 | dims_per_head = dim // n_heads 101 | base = 10000.0 102 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 103 | 104 | if "n_kv_heads" in params: 105 | num_key_value_heads = params["n_kv_heads"] # for GQA / MQA 106 | num_local_key_value_heads = n_heads_per_shard // num_key_value_heads 107 | key_value_dim = dim // num_key_value_heads 108 | else: # compatibility with other checkpoints 109 | num_key_value_heads = n_heads 110 | num_local_key_value_heads = n_heads_per_shard 111 | key_value_dim = dim 112 | 113 | # permute for sliced rotary 114 | def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): 115 | return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) 116 | 117 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.") 118 | # Load weights 119 | if model_size == "7B": 120 | # Not sharded 121 | # (The sharded implementation would also work, but this is simpler.) 
122 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") 123 | else: 124 | # Sharded 125 | loaded = [ 126 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") 127 | for i in range(num_shards) 128 | ] 129 | param_count = 0 130 | index_dict = {"weight_map": {}} 131 | for layer_i in range(n_layers): 132 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" 133 | if model_size == "7B": 134 | # Unsharded 135 | state_dict = { 136 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 137 | loaded[f"layers.{layer_i}.attention.wq.weight"] 138 | ), 139 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 140 | loaded[f"layers.{layer_i}.attention.wk.weight"] 141 | ), 142 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], 143 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], 144 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], 145 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], 146 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], 147 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], 148 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], 149 | } 150 | else: 151 | # Sharded 152 | # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share 153 | # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is 154 | # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
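# In the sharded branch below, wq is viewed with n_heads_per_shard, while wk/wv use
# num_local_key_value_heads and are reshaped to (key_value_dim, dim), matching the
# GQA/MQA handling of n_kv_heads set up earlier in this function.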
155 | 156 | state_dict = { 157 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 158 | f"layers.{layer_i}.attention_norm.weight" 159 | ].clone(), 160 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 161 | f"layers.{layer_i}.ffn_norm.weight" 162 | ].clone(), 163 | } 164 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 165 | torch.cat( 166 | [ 167 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) 168 | for i in range(num_shards) 169 | ], 170 | dim=0, 171 | ).reshape(dim, dim) 172 | ) 173 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 174 | torch.cat( 175 | [ 176 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( 177 | num_local_key_value_heads, dims_per_head, dim 178 | ) 179 | for i in range(num_shards) 180 | ], 181 | dim=0, 182 | ).reshape(key_value_dim, dim), 183 | num_key_value_heads, 184 | key_value_dim, 185 | dim, 186 | ) 187 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 188 | [ 189 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( 190 | num_local_key_value_heads, dims_per_head, dim 191 | ) 192 | for i in range(num_shards) 193 | ], 194 | dim=0, 195 | ).reshape(key_value_dim, dim) 196 | 197 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 198 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 199 | ) 200 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 201 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 202 | ) 203 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 204 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 205 | ) 206 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 207 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 208 | ) 209 | 210 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 211 | for k, v in state_dict.items(): 212 | index_dict["weight_map"][k] = filename 213 | param_count += v.numel() 214 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 215 | 216 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" 217 | if model_size == "7B": 218 | # Unsharded 219 | state_dict = { 220 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 221 | "model.norm.weight": loaded["norm.weight"], 222 | "lm_head.weight": loaded["output.weight"], 223 | } 224 | else: 225 | state_dict = { 226 | "model.norm.weight": loaded[0]["norm.weight"], 227 | "model.embed_tokens.weight": torch.cat( 228 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 229 | ), 230 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), 231 | } 232 | 233 | for k, v in state_dict.items(): 234 | index_dict["weight_map"][k] = filename 235 | param_count += v.numel() 236 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 237 | 238 | # Write configs 239 | index_dict["metadata"] = {"total_size": param_count * 2} 240 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) 241 | ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 242 | multiple_of = params["multiple_of"] if "multiple_of" in params else 256 243 | config = LlamaConfig( 244 | hidden_size=dim, 245 | 
intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), 246 | num_attention_heads=params["n_heads"], 247 | num_hidden_layers=params["n_layers"], 248 | rms_norm_eps=params["norm_eps"], 249 | num_key_value_heads=num_key_value_heads, 250 | ) 251 | config.save_pretrained(tmp_model_path) 252 | 253 | # Make space so we can load the model properly now. 254 | del state_dict 255 | del loaded 256 | gc.collect() 257 | 258 | print("Loading the checkpoint in a Llama model.") 259 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 260 | # Avoid saving this as part of the config. 261 | del model.config._name_or_path 262 | 263 | print("Saving in the Transformers format.") 264 | model.save_pretrained(model_path, safe_serialization=safe_serialization) 265 | shutil.rmtree(tmp_model_path) 266 | 267 | 268 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 269 | # Initialize the tokenizer based on the `spm` model 270 | tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast 271 | print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") 272 | tokenizer = tokenizer_class(input_tokenizer_path) 273 | tokenizer.save_pretrained(tokenizer_path) 274 | 275 | 276 | def main(): 277 | parser = argparse.ArgumentParser() 278 | parser.add_argument( 279 | "--input_dir", 280 | help="Location of LLaMA weights, which contains tokenizer.model and model folders", 281 | ) 282 | parser.add_argument( 283 | "--model_size", 284 | choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"], 285 | help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", 286 | ) 287 | parser.add_argument( 288 | "--output_dir", 289 | help="Location to write HF model and tokenizer", 290 | ) 291 | parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") 292 | args = parser.parse_args() 293 | if args.model_size != "tokenizer_only": 294 | write_model( 295 | model_path=args.output_dir, 296 | # input_base_path=os.path.join(args.input_dir, args.model_size), 297 | input_base_path=args.input_dir, 298 | model_size=args.model_size, 299 | safe_serialization=args.safe_serialization, 300 | ) 301 | spm_path = os.path.join(args.input_dir, "tokenizer.model") 302 | write_tokenizer(args.output_dir, spm_path) 303 | 304 | 305 | if __name__ == "__main__": 306 | main() 307 | 308 | -------------------------------------------------------------------------------- /scripts/test_model/test_pretrain_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n", 11 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 12 | "from transformers import AutoTokenizer,AutoModelForCausalLM\n", 13 | "import torch\n", 14 | "model = AutoModelForCausalLM.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',torch_dtype=torch.float16,device_map='auto',trust_remote_code=True)\n", 15 | "tokenizer = AutoTokenizer.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',use_fast=False)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | 
"outputs": [], 23 | "source": [ 24 | "input_ids = tokenizer(['''Human: 介绍一下北京\\nAssistant: '''], return_tensors=\"pt\",add_special_tokens=False).input_ids.to('cuda') \n", 25 | "print(input_ids) " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "generate_input = {\n", 35 | " \"input_ids\":input_ids,\n", 36 | " \"max_new_tokens\":10,\n", 37 | " \"do_sample\":True,\n", 38 | " \"top_k\":50,\n", 39 | " \"top_p\":0.95,\n", 40 | " \"temperature\":1,\n", 41 | " \"repetition_penalty\":1.0,\n", 42 | " \"eos_token_id\":tokenizer.eos_token_id,\n", 43 | " \"bos_token_id\":tokenizer.bos_token_id,\n", 44 | " \"pad_token_id\":tokenizer.pad_token_id\n", 45 | "}\n", 46 | "generate_ids = model.generate(**generate_input)\n", 47 | "text = tokenizer.decode(generate_ids[0])\n", 48 | "print(text)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# checkpoint-100 的模型输出\n", 56 | "\n", 57 | "# checkpoint-5000 的模型输出\n" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "language_info": { 63 | "name": "python" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 2 68 | } 69 | -------------------------------------------------------------------------------- /train/merge_peft_model/merge.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python merge_peft_adapter.py \ 2 | --adapter_model_name /checkpoint-2200 \ 3 | --output_name checkpoint-2200_merge \ 4 | --load8bit false \ 5 | --tokenizer_fast false -------------------------------------------------------------------------------- /train/merge_peft_model/merge_muilt.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=3 python merge_muilt_peft_adapter.py \ 2 | --adapter_model_name checkpoint-8000 \ 3 | checkpoint-140 \ 4 | --output_name checkpoint-140-8000_merge -------------------------------------------------------------------------------- /train/merge_peft_model/merge_muilt_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional,List 3 | 4 | import peft 5 | import torch 6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification 7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification 8 | from peft.utils import _get_submodules 9 | 10 | @dataclass 11 | class ScriptArguments: 12 | """ 13 | The name of the Casual LM model we wish to fine with PPO 14 | """ 15 | 16 | adapter_model_name: Optional[List[str]] = field(default=None, metadata={"help": "the model name"}) 17 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 18 | 19 | 20 | parser = HfArgumentParser(ScriptArguments) 21 | script_args = parser.parse_args_into_dataclasses()[0] 22 | 23 | base_model = None 24 | for one_lora_path in script_args.adapter_model_name: 25 | if base_model==None: 26 | peft_config = PeftConfig.from_pretrained(one_lora_path) 27 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path) 28 | tokenizer.save_pretrained(f"{script_args.output_name}") 29 | base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.bfloat16) 30 | peft_config = PeftConfig.from_pretrained(one_lora_path) 31 | base_model = 
PeftModel.from_pretrained(base_model, one_lora_path,device_map={"": 0}) 32 | # model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, device_map='auto',load_in_8bit=True) 33 | # Load the Lora model 34 | base_model = base_model.merge_and_unload() 35 | base_model.eval() 36 | 37 | 38 | 39 | 40 | # key_list = [key for key, _ in model.base_model.model.named_modules() if "lora" not in key] 41 | # for key in key_list: 42 | # print(key) 43 | # parent, target, target_name = _get_submodules(model.base_model,key) 44 | # if isinstance(target, peft.tuners.lora.Linear): 45 | # print('peft.tuners.lora.Linear') 46 | # bias = target.bias is not None 47 | # new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) 48 | # model.base_model._replace_module(parent, target_name, new_module, target) 49 | 50 | # model = model.base_model.model 51 | 52 | 53 | base_model.save_pretrained(f"{script_args.output_name}") 54 | # model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) -------------------------------------------------------------------------------- /train/merge_peft_model/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | import peft 5 | import torch 6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification 7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification 8 | from peft.utils import _get_submodules 9 | 10 | @dataclass 11 | class ScriptArguments: 12 | """ 13 | The name of the Casual LM model we wish to fine with PPO 14 | """ 15 | 16 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 17 | load8bit : Optional[bool] = field(default=None, metadata={"help": "the model type"}) 18 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 19 | tokenizer_fast:Optional[bool] = field(default=None, metadata={"help": "the model type"}) 20 | 21 | 22 | parser = HfArgumentParser(ScriptArguments) 23 | script_args = parser.parse_args_into_dataclasses()[0] 24 | 25 | 26 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 27 | model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.float16,device_map='auto',trust_remote_code=True) 28 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name,device_map='auto') 29 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path,use_fast=script_args.tokenizer_fast) 30 | config = AutoConfig.from_pretrained(peft_config.base_model_name_or_path) 31 | architecture = config.architectures[0] 32 | print(architecture) 33 | # Load the Lora model 34 | model = model.merge_and_unload() 35 | model.eval() 36 | 37 | 38 | model.save_pretrained(f"{script_args.output_name}") 39 | tokenizer.save_pretrained(f"{script_args.output_name}") 40 | if script_args.load8bit: 41 | model = AutoModelForCausalLM.from_pretrained(script_args.output_name, torch_dtype=torch.float16,load_in_8bit=script_args.load8bit,device_map='auto',trust_remote_code=True) 42 | model.save_pretrained(f"{script_args.output_name}",max_shard_size='5GB') -------------------------------------------------------------------------------- /train/pretrain/accuracy.py: -------------------------------------------------------------------------------- 1 
| # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Accuracy metric.""" 15 | 16 | import datasets 17 | from sklearn.metrics import accuracy_score 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 24 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 25 | Where: 26 | TP: True positive 27 | TN: True negative 28 | FP: False positive 29 | FN: False negative 30 | """ 31 | 32 | 33 | _KWARGS_DESCRIPTION = """ 34 | Args: 35 | predictions (`list` of `int`): Predicted labels. 36 | references (`list` of `int`): Ground truth labels. 37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. 38 | sample_weight (`list` of `float`): Sample weights Defaults to None. 39 | 40 | Returns: 41 | accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. 42 | 43 | Examples: 44 | 45 | Example 1-A simple example 46 | >>> accuracy_metric = evaluate.load("accuracy") 47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 48 | >>> print(results) 49 | {'accuracy': 0.5} 50 | 51 | Example 2-The same as Example 1, except with `normalize` set to `False`. 52 | >>> accuracy_metric = evaluate.load("accuracy") 53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) 54 | >>> print(results) 55 | {'accuracy': 3.0} 56 | 57 | Example 3-The same as Example 1, except with `sample_weight` set. 58 | >>> accuracy_metric = evaluate.load("accuracy") 59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) 60 | >>> print(results) 61 | {'accuracy': 0.8778625954198473} 62 | """ 63 | 64 | 65 | _CITATION = """ 66 | @article{scikit-learn, 67 | title={Scikit-learn: Machine Learning in {P}ython}, 68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 71 | Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, 72 | journal={Journal of Machine Learning Research}, 73 | volume={12}, 74 | pages={2825--2830}, 75 | year={2011} 76 | } 77 | """ 78 | 79 | 80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 81 | class Accuracy(evaluate.Metric): 82 | def _info(self): 83 | return evaluate.MetricInfo( 84 | description=_DESCRIPTION, 85 | citation=_CITATION, 86 | inputs_description=_KWARGS_DESCRIPTION, 87 | features=datasets.Features( 88 | { 89 | "predictions": datasets.Sequence(datasets.Value("int32")), 90 | "references": datasets.Sequence(datasets.Value("int32")), 91 | } 92 | if self.config_name == "multilabel" 93 | else { 94 | "predictions": datasets.Value("int32"), 95 | "references": datasets.Value("int32"), 96 | } 97 | ), 98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], 99 | ) 100 | 101 | def _compute(self, predictions, references, normalize=True, sample_weight=None): 102 | return { 103 | "accuracy": float( 104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) 105 | ) 106 | } 107 | -------------------------------------------------------------------------------- /train/pretrain/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | 20 | "scheduler": { 21 | "type": "WarmupDecayLR", 22 | "params": { 23 | "last_batch_iteration": -1, 24 | "total_num_steps": "auto", 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | 31 | "zero_optimization": { 32 | "stage": 2, 33 | "offload_optimizer": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "offload_param": { 38 | "device": "cpu", 39 | "pin_memory": true 40 | }, 41 | "allgather_partitions": true, 42 | "allgather_bucket_size": 5e8, 43 | "overlap_comm": true, 44 | "reduce_scatter": true, 45 | "reduce_bucket_size": 5e8, 46 | "contiguous_gradients": true 47 | }, 48 | "activation_checkpointing": { 49 | "partition_activations": false, 50 | "cpu_checkpointing": false, 51 | "contiguous_memory_optimization": false, 52 | "number_checkpoints": null, 53 | "synchronize_checkpoint_boundary": false, 54 | "profile": false 55 | }, 56 | "gradient_accumulation_steps": "auto", 57 | "gradient_clipping": "auto", 58 | "steps_per_print": 2000, 59 | "train_batch_size": "auto", 60 | "min_lr": 5e-7, 61 | "train_micro_batch_size_per_gpu": "auto", 62 | "wall_clock_breakdown": false 63 | } -------------------------------------------------------------------------------- /train/pretrain/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1, 9 | "fp16_opt_level": "O2" 10 | }, 11 | 12 | "bf16": { 13 | "enabled": "auto" 14 | }, 15 | 16 | "optimizer": { 17 | "type": "AdamW", 18 | "params": { 19 | "lr": "auto", 20 | "betas": "auto", 21 | "eps": "auto", 22 | "weight_decay": "auto" 23 | } 24 | }, 25 | 26 | "scheduler": { 27 | "type": "WarmupDecayLR", 28 | "params": { 29 | 
"last_batch_iteration": -1, 30 | "total_num_steps": "auto", 31 | "warmup_min_lr": "auto", 32 | "warmup_max_lr": "auto", 33 | "warmup_num_steps": "auto" 34 | } 35 | }, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "overlap_comm": true, 40 | "contiguous_gradients": true, 41 | "sub_group_size": 1e9, 42 | "reduce_bucket_size": "auto", 43 | "stage3_prefetch_bucket_size": "auto", 44 | "stage3_param_persistence_threshold": "auto", 45 | "stage3_max_live_parameters": 1e9, 46 | "stage3_max_reuse_distance": 1e9, 47 | "gather_16bit_weights_on_model_save": true 48 | }, 49 | "gradient_accumulation_steps": "auto", 50 | "gradient_clipping": "auto", 51 | "steps_per_print": 2000, 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "wall_clock_breakdown": false 55 | } -------------------------------------------------------------------------------- /train/pretrain/pretrain.sh: -------------------------------------------------------------------------------- 1 | output_model=output_model 2 | if [ ! -d ${output_model} ];then 3 | mkdir ${output_model} 4 | fi 5 | cp ./pretrain.sh ${output_model} 6 | cp ./ds_config_zero*.json ${output_model} 7 | export CUDA_HOME=/usr/local/cuda/ 8 | export NCCL_P2P_DISABLE=1 9 | 10 | deepspeed --include localhost:0,2 pretrain_clm.py \ 11 | --config_name ../../model_config/Atom-100M/config.json \ 12 | --tokenizer_name ../../model_config/Atom-100M \ 13 | --train_files ../../data/wiki_zh/train_lm_task_0.csv \ 14 | ../../data/wiki_zh/train_lm_task_1.csv \ 15 | --validation_files ../../data/wiki_zh/dev_lm_task.csv \ 16 | --per_device_train_batch_size 32 \ 17 | --per_device_eval_batch_size 32 \ 18 | --do_train \ 19 | --output_dir ${output_model} \ 20 | --evaluation_strategy steps \ 21 | --use_fast_tokenizer false \ 22 | --max_eval_samples 500 \ 23 | --learning_rate 1e-4 \ 24 | --gradient_accumulation_steps 2 \ 25 | --num_train_epochs 3 \ 26 | --warmup_steps 5000 \ 27 | --logging_dir ${output_model}/logs \ 28 | --logging_strategy steps \ 29 | --logging_steps 5 \ 30 | --save_strategy steps \ 31 | --preprocessing_num_workers 10 \ 32 | --save_steps 100 \ 33 | --eval_steps 5000000 \ 34 | --save_total_limit 2000 \ 35 | --seed 42 \ 36 | --disable_tqdm false \ 37 | --ddp_find_unused_parameters false \ 38 | --block_size 1024 \ 39 | --overwrite_output_dir \ 40 | --report_to tensorboard \ 41 | --run_name ${output_model} \ 42 | --bf16 \ 43 | --bf16_full_eval \ 44 | --gradient_checkpointing \ 45 | --deepspeed ./ds_config_zero3.json \ 46 | --ignore_data_skip true \ 47 | --ddp_timeout 18000000 \ 48 | | tee -a ${output_model}/train.log 49 | 50 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 51 | -------------------------------------------------------------------------------- /train/pretrain/pretrain_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | from dataclasses import dataclass, field 28 | from torchdata.datapipes.iter import IterDataPipe, IterableWrapper 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List 32 | 33 | import datasets 34 | import pandas as pd 35 | import evaluate 36 | import torch 37 | from datasets import load_dataset 38 | from datasets.combine import interleave_datasets 39 | import transformers 40 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 41 | from transformers import ( 42 | CONFIG_MAPPING, 43 | MODEL_FOR_CAUSAL_LM_MAPPING, 44 | AutoConfig, 45 | AutoModelForCausalLM, 46 | AutoTokenizer, 47 | TrainerCallback, 48 | TrainerState, 49 | TrainerControl, 50 | HfArgumentParser, 51 | Trainer, 52 | TrainingArguments, 53 | default_data_collator, 54 | is_torch_tpu_available, 55 | set_seed, 56 | ) 57 | import datetime 58 | from transformers.testing_utils import CaptureLogger 59 | from transformers.trainer_utils import get_last_checkpoint 60 | from transformers.utils import check_min_version, send_example_telemetry 61 | from transformers.utils.versions import require_version 62 | from datasets import interleave_datasets 63 | 64 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 65 | # check_min_version("4.27.0.dev0") 66 | 67 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 68 | 69 | logger = logging.getLogger(__name__) 70 | 71 | 72 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 73 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 74 | 75 | @dataclass 76 | class ModelArguments: 77 | """ 78 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 79 | """ 80 | 81 | model_name_or_path: Optional[str] = field( 82 | default=None, 83 | metadata={ 84 | "help": ( 85 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 86 | ) 87 | }, 88 | ) 89 | model_type: Optional[str] = field( 90 | default=None, 91 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 92 | ) 93 | config_overrides: Optional[str] = field( 94 | default=None, 95 | metadata={ 96 | "help": ( 97 | "Override some existing default config settings when a model is trained from scratch. 
Example: " 98 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 99 | ) 100 | }, 101 | ) 102 | config_name: Optional[str] = field( 103 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 104 | ) 105 | tokenizer_name: Optional[str] = field( 106 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 107 | ) 108 | cache_dir: Optional[str] = field( 109 | default=None, 110 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 111 | ) 112 | use_fast_tokenizer: bool = field( 113 | default=True, 114 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 115 | ) 116 | model_revision: str = field( 117 | default="main", 118 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 119 | ) 120 | use_auth_token: bool = field( 121 | default=False, 122 | metadata={ 123 | "help": ( 124 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 125 | "with private models)." 126 | ) 127 | }, 128 | ) 129 | torch_dtype: Optional[str] = field( 130 | default=None, 131 | metadata={ 132 | "help": ( 133 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 134 | "dtype will be automatically derived from the model's weights." 135 | ), 136 | "choices": ["auto", "bfloat16", "float16", "float32"], 137 | }, 138 | ) 139 | 140 | def __post_init__(self): 141 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 142 | raise ValueError( 143 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 144 | ) 145 | 146 | 147 | @dataclass 148 | class DataTrainingArguments: 149 | """ 150 | Arguments pertaining to what data we are going to input our model for training and eval. 151 | """ 152 | 153 | dataset_name: Optional[str] = field( 154 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 155 | ) 156 | dataset_config_name: Optional[str] = field( 157 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 158 | ) 159 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 160 | validation_files: Optional[List[str]] = field( 161 | default=None, 162 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 163 | ) 164 | max_train_samples: Optional[int] = field( 165 | default=None, 166 | metadata={ 167 | "help": ( 168 | "For debugging purposes or quicker training, truncate the number of training examples to this " 169 | "value if set." 170 | ) 171 | }, 172 | ) 173 | max_eval_samples: Optional[int] = field( 174 | default=None, 175 | metadata={ 176 | "help": ( 177 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 178 | "value if set." 179 | ) 180 | }, 181 | ) 182 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 183 | block_size: Optional[int] = field( 184 | default=None, 185 | metadata={ 186 | "help": ( 187 | "Optional input sequence length after tokenization. " 188 | "The training dataset will be truncated in block of this size for training. 
" 189 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 190 | ) 191 | }, 192 | ) 193 | overwrite_cache: bool = field( 194 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 195 | ) 196 | validation_split_percentage: Optional[int] = field( 197 | default=5, 198 | metadata={ 199 | "help": "The percentage of the train set used as validation set in case there's no validation split" 200 | }, 201 | ) 202 | preprocessing_num_workers: Optional[int] = field( 203 | default=None, 204 | metadata={"help": "The number of processes to use for the preprocessing."}, 205 | ) 206 | keep_linebreaks: bool = field( 207 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 208 | ) 209 | 210 | def __post_init__(self): 211 | if self.streaming: 212 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 213 | 214 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 215 | raise ValueError("Need either a dataset name or a training/validation file.") 216 | else: 217 | if self.train_files is not None: 218 | extension = self.train_files[0].split(".")[-1] 219 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 220 | if self.validation_files is not None: 221 | extension = self.validation_files[0].split(".")[-1] 222 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 223 | 224 | def main(): 225 | # See all possible arguments in src/transformers/training_args.py 226 | # or by passing the --help flag to this script. 227 | # We now keep distinct sets of args, for a cleaner separation of concerns. 228 | 229 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 230 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 231 | # If we pass only one argument to the script and it's the path to a json file, 232 | # let's parse it to get our arguments. 233 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 234 | else: 235 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 236 | 237 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 238 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 239 | send_example_telemetry("run_clm", model_args, data_args) 240 | 241 | # Setup logging 242 | logging.basicConfig( 243 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 244 | datefmt="%m/%d/%Y %H:%M:%S", 245 | handlers=[logging.StreamHandler(sys.stdout)], 246 | ) 247 | 248 | if training_args.should_log: 249 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
250 | transformers.utils.logging.set_verbosity_info() 251 | 252 | log_level = training_args.get_process_log_level() 253 | logger.setLevel(log_level) 254 | datasets.utils.logging.set_verbosity(log_level) 255 | transformers.utils.logging.set_verbosity(log_level) 256 | transformers.utils.logging.enable_default_handler() 257 | transformers.utils.logging.enable_explicit_format() 258 | 259 | # Log on each process the small summary: 260 | logger.warning( 261 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 262 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 263 | ) 264 | logger.info(f"Training/evaluation parameters {training_args}") 265 | 266 | # Detecting last checkpoint. 267 | last_checkpoint = None 268 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 269 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 270 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 271 | raise ValueError( 272 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 273 | "Use --overwrite_output_dir to overcome." 274 | ) 275 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 276 | logger.info( 277 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 278 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 279 | ) 280 | 281 | # Set seed before initializing model. 282 | set_seed(training_args.seed) 283 | 284 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 285 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 286 | # (the dataset will be downloaded automatically from the datasets Hub). 287 | # 288 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 289 | # 'text' is found. You can easily tweak this behavior (see below). 290 | # 291 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 292 | # download the dataset. 293 | if True: 294 | data_files = {} 295 | dataset_args = {} 296 | if data_args.train_files is not None: 297 | 298 | print(data_args.train_files) 299 | data_files["train"] = data_args.train_files 300 | print('训练文件总个数',len(data_args.train_files)) 301 | if data_args.validation_files is not None: 302 | data_files["validation"] = data_args.validation_files 303 | extension = ( 304 | data_files["train"][0].split(".")[-1] 305 | if data_files["train"] is not None 306 | else data_args.validation_files.split(".")[-1] 307 | ) 308 | if extension == "txt": 309 | extension = "text" 310 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 311 | 312 | 313 | raw_datasets = load_dataset( 314 | extension, 315 | data_files=data_files, 316 | streaming=data_args.streaming, 317 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 318 | use_auth_token=True if model_args.use_auth_token else None, 319 | **dataset_args, 320 | ) 321 | if data_args.streaming: 322 | raw_datasets = raw_datasets.shuffle(seed=training_args.seed, buffer_size=1000000) 323 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
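# e.g. with the default validation_split_percentage=5, train[:5%] becomes the
# validation split and train[5%:] the training split in the fallback below.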
324 | if "validation" not in raw_datasets.keys(): 325 | raw_datasets["validation"] = load_dataset( 326 | extension, 327 | data_files=data_files, 328 | split=f"train[:{data_args.validation_split_percentage}%]", 329 | cache_dir=model_args.cache_dir, 330 | use_auth_token=True if model_args.use_auth_token else None, 331 | **dataset_args, 332 | ) 333 | raw_datasets["train"] = load_dataset( 334 | extension, 335 | data_files=data_files, 336 | split=f"train[{data_args.validation_split_percentage}%:]", 337 | cache_dir=model_args.cache_dir, 338 | use_auth_token=True if model_args.use_auth_token else None, 339 | **dataset_args, 340 | ) 341 | 342 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 343 | # https://huggingface.co/docs/datasets/loading_datasets.html. 344 | 345 | # Load pretrained model and tokenizer 346 | # 347 | # Distributed training: 348 | # The .from_pretrained methods guarantee that only one local process can concurrently 349 | # download model & vocab. 350 | 351 | config_kwargs = { 352 | "cache_dir": model_args.cache_dir, 353 | "revision": model_args.model_revision, 354 | "use_auth_token": True if model_args.use_auth_token else None, 355 | } 356 | if model_args.config_name: 357 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 358 | elif model_args.model_name_or_path: 359 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 360 | else: 361 | config = CONFIG_MAPPING[model_args.model_type]() 362 | logger.warning("You are instantiating a new config instance from scratch.") 363 | if model_args.config_overrides is not None: 364 | logger.info(f"Overriding config: {model_args.config_overrides}") 365 | config.update_from_string(model_args.config_overrides) 366 | logger.info(f"New config: {config}") 367 | 368 | print(training_args.local_rank,'start load tokenizer') 369 | tokenizer_kwargs = { 370 | "cache_dir": model_args.cache_dir, 371 | "use_fast": model_args.use_fast_tokenizer, 372 | "revision": model_args.model_revision, 373 | "use_auth_token": True if model_args.use_auth_token else None, 374 | } 375 | if model_args.tokenizer_name: 376 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 377 | elif model_args.model_name_or_path: 378 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 379 | else: 380 | raise ValueError( 381 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 382 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
383 | ) 384 | print(training_args.local_rank,'end load tokenizer') 385 | print(training_args.local_rank,'start load model') 386 | if model_args.model_name_or_path: 387 | torch_dtype = ( 388 | model_args.torch_dtype 389 | if model_args.torch_dtype in ["auto", None] 390 | else getattr(torch, model_args.torch_dtype) 391 | ) 392 | model = AutoModelForCausalLM.from_pretrained( 393 | model_args.model_name_or_path, 394 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 395 | config=config, 396 | cache_dir=model_args.cache_dir, 397 | revision=model_args.model_revision, 398 | trust_remote_code=True, 399 | use_flash_attention_2=True, 400 | use_auth_token=True if model_args.use_auth_token else None, 401 | ) 402 | else: 403 | model = AutoModelForCausalLM.from_config(config,trust_remote_code=True) 404 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 405 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 406 | print(training_args.local_rank,'end load model') 407 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 408 | # on a small vocab and want a smaller embedding size, remove this test. 409 | embedding_size = model.get_input_embeddings().weight.shape[0] 410 | if len(tokenizer) > embedding_size: 411 | model.resize_token_embeddings(len(tokenizer)) 412 | # Preprocessing the datasets. 413 | # First we tokenize all the texts. 414 | if training_args.do_train: 415 | if data_args.streaming: 416 | dataset_head = raw_datasets["train"].take(3) 417 | print(list(dataset_head)) 418 | column_names = list(list(dataset_head)[0].keys()) 419 | else: 420 | column_names = list(raw_datasets["train"].features) 421 | else: 422 | if data_args.streaming: 423 | dataset_head = raw_datasets["validation"].take(3) 424 | column_names = list(list(dataset_head)[0].keys()) 425 | else: 426 | column_names = list(raw_datasets["validation"].features) 427 | print(column_names) 428 | text_column_name = "text" if "text" in column_names else column_names[0] 429 | 430 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 431 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 432 | 433 | def tokenize_function(examples): 434 | with CaptureLogger(tok_logger) as cl: 435 | output = tokenizer( [ item for item in examples[text_column_name]]) 436 | return output 437 | 438 | with training_args.main_process_first(desc="dataset map tokenization"): 439 | if not data_args.streaming: 440 | tokenized_datasets = raw_datasets.map( 441 | tokenize_function, 442 | batched=True, 443 | num_proc=data_args.preprocessing_num_workers, 444 | remove_columns=column_names, 445 | load_from_cache_file=not data_args.overwrite_cache, 446 | desc="Running tokenizer on dataset", 447 | ) 448 | else: 449 | tokenized_datasets = raw_datasets.map( 450 | tokenize_function, 451 | batched=True, 452 | remove_columns=column_names, 453 | batch_size = 60000, 454 | ) 455 | 456 | if data_args.block_size is None: 457 | block_size = tokenizer.model_max_length 458 | if block_size > 1024: 459 | logger.warning( 460 | "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" 461 | " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" 462 | " override this default with `--block_size xxx`." 
463 | ) 464 | block_size = 1024 465 | else: 466 | if data_args.block_size > tokenizer.model_max_length: 467 | logger.warning( 468 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" 469 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 470 | ) 471 | block_size = min(data_args.block_size, tokenizer.model_max_length) 472 | 473 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 474 | def group_texts(examples): 475 | # Concatenate all texts. 476 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 477 | # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 478 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 479 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 480 | # customize this part to your needs. 481 | if total_length >= block_size: 482 | total_length = (total_length // block_size) * block_size 483 | # Split by chunks of max_len. 484 | result = { 485 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 486 | for k, t in concatenated_examples.items() 487 | } 488 | # print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 489 | logger.info("group texts input examples length%d after_group size%d"%(len(examples['input_ids']),len(result["input_ids"]))) 490 | result["labels"] = result["input_ids"].copy() 491 | return result 492 | 493 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 494 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 495 | # to preprocess. 496 | # 497 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 498 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 499 | 500 | with training_args.main_process_first(desc="grouping texts together"): 501 | if not data_args.streaming: 502 | lm_datasets = tokenized_datasets.map( 503 | group_texts, 504 | batched=True, 505 | num_proc=data_args.preprocessing_num_workers, 506 | load_from_cache_file=not data_args.overwrite_cache, 507 | desc=f"Grouping texts in chunks of {block_size}", 508 | batch_size = 40000, 509 | ) 510 | else: 511 | lm_datasets = tokenized_datasets.map( 512 | group_texts, 513 | batched=True, 514 | batch_size = 60000, 515 | ) 516 | print(training_args.local_rank,'start select train_dataset') 517 | if training_args.do_train: 518 | if "train" not in tokenized_datasets: 519 | raise ValueError("--do_train requires a train dataset") 520 | train_dataset = lm_datasets["train"] 521 | if data_args.max_train_samples is not None and data_args.streaming==False: 522 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 523 | train_dataset = train_dataset.select(range(max_train_samples)) 524 | print(training_args.local_rank,'end select train_dataset') 525 | 526 | if training_args.do_eval: 527 | if "validation" not in tokenized_datasets: 528 | raise ValueError("--do_eval requires a validation dataset") 529 | print(training_args.local_rank,'start select eval_dataset') 530 | eval_dataset = lm_datasets["validation"] 531 | if data_args.max_eval_samples is not None and data_args.streaming==False : 532 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 533 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 534 | print(training_args.local_rank,'end select eval_dataset') 535 | def preprocess_logits_for_metrics(logits, labels): 536 | if isinstance(logits, tuple): 537 | # Depending on the model and config, logits may contain extra tensors, 538 | # like past_key_values, but logits always come first 539 | logits = logits[0] 540 | return logits.argmax(dim=-1) 541 | print(training_args.local_rank,'start load metric') 542 | metric = evaluate.load("accuracy.py") 543 | print(training_args.local_rank,'end load metric') 544 | 545 | def compute_metrics(eval_preds): 546 | preds, labels = eval_preds 547 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 548 | # by preprocess_logits_for_metrics but we need to shift the labels 549 | labels = labels[:, 1:].reshape(-1) 550 | preds = preds[:, :-1].reshape(-1) 551 | return metric.compute(predictions=preds, references=labels) 552 | 553 | print(training_args.local_rank,'Initialize our Trainer') 554 | trainer = Trainer( 555 | model=model, 556 | args=training_args, 557 | train_dataset= IterableWrapper(train_dataset) if training_args.do_train else None, 558 | eval_dataset= IterableWrapper(eval_dataset) if training_args.do_eval else None, 559 | tokenizer=tokenizer, 560 | # Data collator will default to DataCollatorWithPadding, so we change it. 
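# (default_data_collator does no padding at all; that is fine here because group_texts above has
# already packed every example to exactly `block_size` tokens. The SFT script later in this
# repository, which keeps variable-length examples, switches to DataCollatorForSeq2Seq with
# padding instead.)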
561 | data_collator=default_data_collator, 562 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 563 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 564 | # callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None), 565 | ) 566 | 567 | if training_args.do_train: 568 | checkpoint = None 569 | if training_args.resume_from_checkpoint is not None: 570 | checkpoint = training_args.resume_from_checkpoint 571 | elif last_checkpoint is not None: 572 | checkpoint = last_checkpoint 573 | 574 | print(training_args.local_rank,'start train') 575 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 576 | trainer.save_model() # Saves the tokenizer too for easy upload 577 | 578 | metrics = train_result.metrics 579 | 580 | max_train_samples = ( 581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 582 | ) 583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 584 | 585 | trainer.log_metrics("train", metrics) 586 | trainer.save_metrics("train", metrics) 587 | trainer.save_state() 588 | 589 | # Evaluation 590 | if training_args.do_eval: 591 | logger.info("*** Evaluate ***") 592 | 593 | metrics = trainer.evaluate() 594 | 595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 597 | try: 598 | perplexity = math.exp(metrics["eval_loss"]) 599 | except OverflowError: 600 | perplexity = float("inf") 601 | metrics["perplexity"] = perplexity 602 | 603 | trainer.log_metrics("eval", metrics) 604 | trainer.save_metrics("eval", metrics) 605 | 606 | 607 | 608 | def _mp_fn(index): 609 | # For xla_spawn (TPUs) 610 | main() 611 | 612 | 613 | if __name__ == "__main__": 614 | main() 615 | -------------------------------------------------------------------------------- /train/sft/accuracy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Accuracy metric.""" 15 | 16 | import datasets 17 | from sklearn.metrics import accuracy_score 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 24 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 25 | Where: 26 | TP: True positive 27 | TN: True negative 28 | FP: False positive 29 | FN: False negative 30 | """ 31 | 32 | 33 | _KWARGS_DESCRIPTION = """ 34 | Args: 35 | predictions (`list` of `int`): Predicted labels. 36 | references (`list` of `int`): Ground truth labels. 37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. 
Otherwise, returns the fraction of correctly classified samples. Defaults to True. 38 | sample_weight (`list` of `float`): Sample weights Defaults to None. 39 | 40 | Returns: 41 | accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. 42 | 43 | Examples: 44 | 45 | Example 1-A simple example 46 | >>> accuracy_metric = evaluate.load("accuracy") 47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 48 | >>> print(results) 49 | {'accuracy': 0.5} 50 | 51 | Example 2-The same as Example 1, except with `normalize` set to `False`. 52 | >>> accuracy_metric = evaluate.load("accuracy") 53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) 54 | >>> print(results) 55 | {'accuracy': 3.0} 56 | 57 | Example 3-The same as Example 1, except with `sample_weight` set. 58 | >>> accuracy_metric = evaluate.load("accuracy") 59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) 60 | >>> print(results) 61 | {'accuracy': 0.8778625954198473} 62 | """ 63 | 64 | 65 | _CITATION = """ 66 | @article{scikit-learn, 67 | title={Scikit-learn: Machine Learning in {P}ython}, 68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 71 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, 72 | journal={Journal of Machine Learning Research}, 73 | volume={12}, 74 | pages={2825--2830}, 75 | year={2011} 76 | } 77 | """ 78 | 79 | 80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 81 | class Accuracy(evaluate.Metric): 82 | def _info(self): 83 | return evaluate.MetricInfo( 84 | description=_DESCRIPTION, 85 | citation=_CITATION, 86 | inputs_description=_KWARGS_DESCRIPTION, 87 | features=datasets.Features( 88 | { 89 | "predictions": datasets.Sequence(datasets.Value("int32")), 90 | "references": datasets.Sequence(datasets.Value("int32")), 91 | } 92 | if self.config_name == "multilabel" 93 | else { 94 | "predictions": datasets.Value("int32"), 95 | "references": datasets.Value("int32"), 96 | } 97 | ), 98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], 99 | ) 100 | 101 | def _compute(self, predictions, references, normalize=True, sample_weight=None): 102 | return { 103 | "accuracy": float( 104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) 105 | ) 106 | } 107 | -------------------------------------------------------------------------------- /train/sft/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | 20 | "scheduler": { 21 | "type": "WarmupDecayLR", 22 | "params": { 23 | "last_batch_iteration": -1, 24 | "total_num_steps": "auto", 25 | "warmup_min_lr": "auto", 26 | 
"warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | 31 | "zero_optimization": { 32 | "stage": 2, 33 | "offload_optimizer": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "offload_param": { 38 | "device": "cpu", 39 | "pin_memory": true 40 | }, 41 | "allgather_partitions": true, 42 | "allgather_bucket_size": 5e8, 43 | "overlap_comm": true, 44 | "reduce_scatter": true, 45 | "reduce_bucket_size": 5e8, 46 | "contiguous_gradients": true 47 | }, 48 | "activation_checkpointing": { 49 | "partition_activations": false, 50 | "cpu_checkpointing": false, 51 | "contiguous_memory_optimization": false, 52 | "number_checkpoints": null, 53 | "synchronize_checkpoint_boundary": false, 54 | "profile": false 55 | }, 56 | "gradient_accumulation_steps": "auto", 57 | "gradient_clipping": "auto", 58 | "steps_per_print": 2000, 59 | "train_batch_size": "auto", 60 | "min_lr": 5e-7, 61 | "train_micro_batch_size_per_gpu": "auto", 62 | "wall_clock_breakdown": false 63 | } -------------------------------------------------------------------------------- /train/sft/finetune.sh: -------------------------------------------------------------------------------- 1 | output_model=save_folder 2 | # 需要修改到自己的输入目录 3 | if [ ! -d ${output_model} ];then 4 | mkdir ${output_model} 5 | fi 6 | cp ./finetune.sh ${output_model} 7 | deepspeed --include localhost:1,0 finetune_clm.py \ 8 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ 9 | --train_files ../../data/train_sft.csv \ 10 | --validation_files ../../data/dev_sft.csv \ 11 | ../../data/dev_sft_sharegpt.csv \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --do_train \ 15 | --do_eval \ 16 | --use_fast_tokenizer false \ 17 | --output_dir ${output_model} \ 18 | --evaluation_strategy steps \ 19 | --max_eval_samples 800 \ 20 | --learning_rate 1e-4 \ 21 | --gradient_accumulation_steps 8 \ 22 | --num_train_epochs 10 \ 23 | --warmup_steps 400 \ 24 | --logging_dir ${output_model}/logs \ 25 | --logging_strategy steps \ 26 | --logging_steps 10 \ 27 | --save_strategy steps \ 28 | --preprocessing_num_workers 10 \ 29 | --save_steps 20 \ 30 | --eval_steps 20 \ 31 | --save_total_limit 2000 \ 32 | --seed 42 \ 33 | --disable_tqdm false \ 34 | --ddp_find_unused_parameters false \ 35 | --block_size 2048 \ 36 | --report_to tensorboard \ 37 | --overwrite_output_dir \ 38 | --deepspeed ds_config_zero2.json \ 39 | --ignore_data_skip true \ 40 | --bf16 \ 41 | --gradient_checkpointing \ 42 | --bf16_full_eval \ 43 | --ddp_timeout 18000000 \ 44 | | tee -a ${output_model}/train.log 45 | 46 | 47 | 48 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 49 | -------------------------------------------------------------------------------- /train/sft/finetune_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | import random 28 | from dataclasses import dataclass, field 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List,Union 32 | 33 | import datasets 34 | import evaluate 35 | import torch 36 | from datasets import load_dataset 37 | from peft import ( # noqa: E402 38 | LoraConfig, 39 | PeftModel, 40 | get_peft_model, 41 | get_peft_model_state_dict, 42 | prepare_model_for_int8_training, 43 | prepare_model_for_kbit_training, 44 | set_peft_model_state_dict, 45 | ) 46 | import transformers 47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 48 | from transformers import ( 49 | CONFIG_MAPPING, 50 | MODEL_FOR_CAUSAL_LM_MAPPING, 51 | AutoConfig, 52 | AutoModelForCausalLM, 53 | AutoTokenizer, 54 | TrainerCallback, 55 | TrainerState, 56 | TrainerControl, 57 | HfArgumentParser, 58 | Trainer, 59 | TrainingArguments, 60 | default_data_collator, 61 | BitsAndBytesConfig, 62 | is_torch_tpu_available, 63 | set_seed, 64 | ) 65 | from transformers.testing_utils import CaptureLogger 66 | from transformers.trainer_utils import get_last_checkpoint 67 | from transformers.utils import check_min_version, send_example_telemetry 68 | from transformers.utils.versions import require_version 69 | 70 | import pdb 71 | 72 | 73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 74 | # check_min_version("4.27.0.dev0") 75 | 76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 77 | 78 | logger = logging.getLogger(__name__) 79 | 80 | 81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 83 | 84 | 85 | @dataclass 86 | class ModelArguments: 87 | """ 88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 89 | """ 90 | 91 | model_name_or_path: Optional[str] = field( 92 | default=None, 93 | metadata={ 94 | "help": ( 95 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 96 | ) 97 | }, 98 | ) 99 | model_type: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 102 | ) 103 | config_overrides: Optional[str] = field( 104 | default=None, 105 | metadata={ 106 | "help": ( 107 | "Override some existing default config settings when a model is trained from scratch. 
Example: " 108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 109 | ) 110 | }, 111 | ) 112 | config_name: Optional[str] = field( 113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 114 | ) 115 | tokenizer_name: Optional[str] = field( 116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 117 | ) 118 | cache_dir: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 121 | ) 122 | use_fast_tokenizer: bool = field( 123 | default=True, 124 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 125 | ) 126 | model_revision: str = field( 127 | default="main", 128 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 129 | ) 130 | use_auth_token: bool = field( 131 | default=False, 132 | metadata={ 133 | "help": ( 134 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 135 | "with private models)." 136 | ) 137 | }, 138 | ) 139 | 140 | torch_dtype: Optional[str] = field( 141 | default=None, 142 | metadata={ 143 | "help": ( 144 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 145 | "dtype will be automatically derived from the model's weights." 146 | ), 147 | "choices": ["auto", "bfloat16", "float16", "float32"], 148 | }, 149 | ) 150 | 151 | def __post_init__(self): 152 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 153 | raise ValueError( 154 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 155 | ) 156 | 157 | 158 | 159 | @dataclass 160 | class DataTrainingArguments: 161 | """ 162 | Arguments pertaining to what data we are going to input our model for training and eval. 163 | """ 164 | train_on_inputs: bool = field( 165 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 166 | ) 167 | dataset_name: Optional[str] = field( 168 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 169 | ) 170 | dataset_config_name: Optional[str] = field( 171 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 172 | ) 173 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 174 | validation_files: Optional[List[str]] = field( 175 | default=None, 176 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 177 | ) 178 | max_train_samples: Optional[int] = field( 179 | default=None, 180 | metadata={ 181 | "help": ( 182 | "For debugging purposes or quicker training, truncate the number of training examples to this " 183 | "value if set." 184 | ) 185 | }, 186 | ) 187 | max_eval_samples: Optional[int] = field( 188 | default=None, 189 | metadata={ 190 | "help": ( 191 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 192 | "value if set." 
193 | ) 194 | }, 195 | ) 196 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 197 | block_size: Optional[int] = field( 198 | default=None, 199 | metadata={ 200 | "help": ( 201 | "Optional input sequence length after tokenization. " 202 | "The training dataset will be truncated in block of this size for training. " 203 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 204 | ) 205 | }, 206 | ) 207 | overwrite_cache: bool = field( 208 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 209 | ) 210 | validation_split_percentage: Optional[int] = field( 211 | default=5, 212 | metadata={ 213 | "help": "The percentage of the train set used as validation set in case there's no validation split" 214 | }, 215 | ) 216 | preprocessing_num_workers: Optional[int] = field( 217 | default=None, 218 | metadata={"help": "The number of processes to use for the preprocessing."}, 219 | ) 220 | keep_linebreaks: bool = field( 221 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 222 | ) 223 | 224 | def __post_init__(self): 225 | if self.streaming: 226 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 227 | 228 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 229 | raise ValueError("Need either a dataset name or a training/validation file.") 230 | else: 231 | if self.train_files is not None: 232 | extension = self.train_files[0].split(".")[-1] 233 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 234 | if self.validation_files is not None: 235 | extension = self.validation_files[0].split(".")[-1] 236 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 237 | 238 | def main(): 239 | # See all possible arguments in src/transformers/training_args.py 240 | # or by passing the --help flag to this script. 241 | # We now keep distinct sets of args, for a cleaner separation of concerns. 242 | 243 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 244 | # pdb.set_trace() 245 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 246 | # If we pass only one argument to the script and it's the path to a json file, 247 | # let's parse it to get our arguments. 248 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 249 | else: 250 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 251 | 252 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 253 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 254 | send_example_telemetry("run_clm", model_args, data_args) 255 | 256 | # Setup logging 257 | logging.basicConfig( 258 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 259 | datefmt="%m/%d/%Y %H:%M:%S", 260 | handlers=[logging.StreamHandler(sys.stdout)], 261 | ) 262 | 263 | if training_args.should_log: 264 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
265 | transformers.utils.logging.set_verbosity_info() 266 | 267 | log_level = training_args.get_process_log_level() 268 | logger.setLevel(log_level) 269 | datasets.utils.logging.set_verbosity(log_level) 270 | transformers.utils.logging.set_verbosity(log_level) 271 | transformers.utils.logging.enable_default_handler() 272 | transformers.utils.logging.enable_explicit_format() 273 | 274 | # Log on each process the small summary: 275 | logger.warning( 276 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 277 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 278 | ) 279 | logger.info(f"Training/evaluation parameters {training_args}") 280 | 281 | # Detecting last checkpoint. 282 | last_checkpoint = None 283 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 284 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 285 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 286 | raise ValueError( 287 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 288 | "Use --overwrite_output_dir to overcome." 289 | ) 290 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 291 | logger.info( 292 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 293 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 294 | ) 295 | 296 | # Set seed before initializing model. 297 | set_seed(training_args.seed) 298 | 299 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 300 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 301 | # (the dataset will be downloaded automatically from the datasets Hub). 302 | # 303 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 304 | # 'text' is found. You can easily tweak this behavior (see below). 305 | # 306 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 307 | # download the dataset. 308 | if True: 309 | data_files = {} 310 | dataset_args = {} 311 | if data_args.train_files is not None: 312 | data_files["train"] = data_args.train_files 313 | if data_args.validation_files is not None: 314 | data_files["validation"] = data_args.validation_files 315 | extension = ( 316 | data_args.train_files[0].split(".")[-1] 317 | if data_args.train_files is not None 318 | else data_args.validation_files.split(".")[-1] 319 | ) 320 | if extension == "txt": 321 | extension = "text" 322 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 323 | raw_datasets = load_dataset( 324 | extension, 325 | data_files=data_files, 326 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 327 | use_auth_token=True if model_args.use_auth_token else None, 328 | **dataset_args, 329 | ) 330 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
331 | if "validation" not in raw_datasets.keys(): 332 | raw_datasets["validation"] = load_dataset( 333 | extension, 334 | data_files=data_files, 335 | split=f"train[:{data_args.validation_split_percentage}%]", 336 | cache_dir=model_args.cache_dir, 337 | use_auth_token=True if model_args.use_auth_token else None, 338 | **dataset_args, 339 | ) 340 | raw_datasets["train"] = load_dataset( 341 | extension, 342 | data_files=data_files, 343 | split=f"train[{data_args.validation_split_percentage}%:]", 344 | cache_dir=model_args.cache_dir, 345 | use_auth_token=True if model_args.use_auth_token else None, 346 | **dataset_args, 347 | ) 348 | 349 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 350 | # https://huggingface.co/docs/datasets/loading_datasets.html. 351 | 352 | # Load pretrained model and tokenizer 353 | # 354 | # Distributed training: 355 | # The .from_pretrained methods guarantee that only one local process can concurrently 356 | # download model & vocab. 357 | 358 | config_kwargs = { 359 | "cache_dir": model_args.cache_dir, 360 | "revision": model_args.model_revision, 361 | "use_auth_token": True if model_args.use_auth_token else None, 362 | } 363 | if model_args.config_name: 364 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 365 | elif model_args.model_name_or_path: 366 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 367 | else: 368 | config = CONFIG_MAPPING[model_args.model_type]() 369 | logger.warning("You are instantiating a new config instance from scratch.") 370 | if model_args.config_overrides is not None: 371 | logger.info(f"Overriding config: {model_args.config_overrides}") 372 | config.update_from_string(model_args.config_overrides) 373 | logger.info(f"New config: {config}") 374 | 375 | tokenizer_kwargs = { 376 | "cache_dir": model_args.cache_dir, 377 | "use_fast": model_args.use_fast_tokenizer, 378 | "revision": model_args.model_revision, 379 | "use_auth_token": True if model_args.use_auth_token else None, 380 | "padding_side":'left' 381 | } 382 | if model_args.tokenizer_name: 383 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 384 | elif model_args.model_name_or_path: 385 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 386 | else: 387 | raise ValueError( 388 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 389 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
390 | ) 391 | tokenizer.pad_token = tokenizer.eos_token 392 | if model_args.model_name_or_path: 393 | torch_dtype = ( 394 | model_args.torch_dtype 395 | if model_args.torch_dtype in ["auto", None] 396 | else getattr(torch, model_args.torch_dtype) 397 | ) 398 | print(torch_dtype) 399 | torch_dtype = torch.float16 400 | model = AutoModelForCausalLM.from_pretrained( 401 | model_args.model_name_or_path, 402 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 403 | config=config, 404 | cache_dir=model_args.cache_dir, 405 | revision=model_args.model_revision, 406 | use_auth_token=True if model_args.use_auth_token else None, 407 | torch_dtype=torch_dtype, 408 | trust_remote_code=True, 409 | use_flash_attention_2=True, 410 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)} 411 | ) 412 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[]) 413 | 414 | else: 415 | model = AutoModelForCausalLM.from_config(config) 416 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 417 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 418 | 419 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 420 | # on a small vocab and want a smaller embedding size, remove this test. 421 | embedding_size = model.get_input_embeddings().weight.shape[0] 422 | if len(tokenizer) > embedding_size: 423 | model.resize_token_embeddings(len(tokenizer)) 424 | 425 | # Preprocessing the datasets. 426 | # First we tokenize all the texts. 427 | if training_args.do_train: 428 | column_names = list(raw_datasets["train"].features) 429 | else: 430 | column_names = list(raw_datasets["validation"].features) 431 | 432 | train_on_inputs = True 433 | if len(column_names)==1: 434 | text_column_name = "text" if "text" in column_names else column_names[0] 435 | elif len(column_names)==2: 436 | input_column_name = 'input' if 'input' in column_names else column_names[0] 437 | target_column_name = 'target' if 'target' in column_names else column_names[0] 438 | train_on_inputs=False 439 | else: 440 | raise ValueError('输入文件列数不对') 441 | print('train_on_inputs',train_on_inputs) 442 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 443 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 444 | 445 | def tokenize_function(examples): 446 | with CaptureLogger(tok_logger) as cl: 447 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 448 | output['labels'] = output['input_ids'].copy() 449 | return output 450 | 451 | def tokenize(prompt): 452 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 453 | result["labels"] = result["input_ids"].copy() 454 | return result 455 | 456 | def generate_and_tokenize_prompt(data_point): 457 | input_text = data_point[input_column_name] 458 | target_text = data_point[target_column_name] 459 | full_prompt = input_text+target_text 460 | tokenized_full_prompt = tokenize(full_prompt) 461 | if not train_on_inputs: 462 | user_prompt = input_text 463 | tokenized_user_prompt = tokenize(user_prompt) 464 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 465 | tokenized_full_prompt["labels"] = [ 466 | -100 467 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 468 | 
user_prompt_len: 469 | ] 470 | return tokenized_full_prompt 471 | 472 | 473 | 474 | with training_args.main_process_first(desc="dataset map tokenization"): 475 | if not data_args.streaming: 476 | tokenized_datasets = raw_datasets.map( 477 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 478 | batched=True if train_on_inputs==True else False, 479 | num_proc=data_args.preprocessing_num_workers, 480 | remove_columns=column_names, 481 | load_from_cache_file=not data_args.overwrite_cache, 482 | desc="Running tokenizer on dataset", 483 | ) 484 | else: 485 | tokenized_datasets = raw_datasets.map( 486 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 487 | batched=True if train_on_inputs==True else False, 488 | remove_columns=column_names, 489 | ) 490 | 491 | if data_args.block_size is None: 492 | block_size = tokenizer.model_max_length 493 | if block_size > 2048: 494 | block_size = 2048 495 | else: 496 | block_size = min(data_args.block_size, tokenizer.model_max_length) 497 | 498 | if training_args.do_train: 499 | if "train" not in tokenized_datasets: 500 | raise ValueError("--do_train requires a train dataset") 501 | train_dataset = tokenized_datasets["train"] 502 | if data_args.max_train_samples is not None: 503 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 504 | train_dataset = train_dataset.select(range(max_train_samples)) 505 | for index in random.sample(range(len(train_dataset)), 3): 506 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 507 | train_dataset = train_dataset.shuffle(seed=training_args.seed) 508 | 509 | if training_args.do_eval: 510 | if "validation" not in tokenized_datasets: 511 | raise ValueError("--do_eval requires a validation dataset") 512 | eval_dataset = tokenized_datasets["validation"] 513 | if data_args.max_eval_samples is not None: 514 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 515 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 516 | 517 | def preprocess_logits_for_metrics(logits, labels): 518 | if isinstance(logits, tuple): 519 | # Depending on the model and config, logits may contain extra tensors, 520 | # like past_key_values, but logits always come first 521 | logits = logits[0] 522 | return logits.argmax(dim=-1) 523 | 524 | metric = evaluate.load("accuracy.py") 525 | 526 | def compute_metrics(eval_preds): 527 | preds, labels = eval_preds 528 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 529 | # by preprocess_logits_for_metrics but we need to shift the labels 530 | labels = labels[:, 1:].reshape(-1) 531 | # .reshape(-1) 532 | preds = preds[:, :-1].reshape(-1) 533 | # .reshape(-1) 534 | # print(labels.shape) 535 | # true_predictions = [ 536 | # [p for (p, l) in zip(pred, gold_label) if l != -100] 537 | # for pred, gold_label in zip(preds, labels) 538 | # ] 539 | # true_labels = [ 540 | # [l for (p, l) in zip(pred, gold_label) if l != -100] 541 | # for pred, gold_label in zip(preds, labels) 542 | # ] 543 | # preds = np.array(true_predictions).reshape(-1) 544 | # labels = np.array(true_labels).reshape(-1) 545 | return metric.compute(predictions=preds, references=labels) 546 | 547 | # Initialize our Trainer 548 | trainer = Trainer( 549 | model=model, 550 | args=training_args, 551 | train_dataset=train_dataset if training_args.do_train else None, 552 | eval_dataset=eval_dataset if training_args.do_eval else None, 553 | tokenizer=tokenizer, 554 | # Data collator will 
default to DataCollatorWithPadding, so we change it. 555 | data_collator=transformers.DataCollatorForSeq2Seq( 556 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 557 | ), 558 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 559 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 560 | ) 561 | 562 | # Training 563 | if training_args.do_train: 564 | checkpoint = None 565 | if training_args.resume_from_checkpoint is not None: 566 | checkpoint = training_args.resume_from_checkpoint 567 | elif last_checkpoint is not None: 568 | checkpoint = last_checkpoint 569 | 570 | print(training_args.local_rank,'start train') 571 | 572 | if torch.__version__ >= "2" and sys.platform != "win32": 573 | model = torch.compile(model) 574 | 575 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 576 | trainer.save_model() # Saves the tokenizer too for easy upload 577 | 578 | metrics = train_result.metrics 579 | 580 | max_train_samples = ( 581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 582 | ) 583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 584 | 585 | trainer.log_metrics("train", metrics) 586 | trainer.save_metrics("train", metrics) 587 | trainer.save_state() 588 | 589 | # Evaluation 590 | if training_args.do_eval: 591 | logger.info("*** Evaluate ***") 592 | 593 | metrics = trainer.evaluate() 594 | 595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 597 | try: 598 | perplexity = math.exp(metrics["eval_loss"]) 599 | except OverflowError: 600 | perplexity = float("inf") 601 | metrics["perplexity"] = perplexity 602 | 603 | trainer.log_metrics("eval", metrics) 604 | trainer.save_metrics("eval", metrics) 605 | 606 | 607 | 608 | def _mp_fn(index): 609 | # For xla_spawn (TPUs) 610 | main() 611 | 612 | 613 | if __name__ == "__main__": 614 | main() 615 | -------------------------------------------------------------------------------- /train/sft/finetune_clm_lora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
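# Illustrative sketch (not from this repository): relative to finetune_clm.py, the LoRA variant's
# main difference is typically that the base model is wrapped with a PEFT adapter before it
# reaches the Trainer. Assuming the argument names defined below (lora_r, lora_alpha,
# target_modules), the wrapping step usually looks like:
#
#     from peft import LoraConfig, get_peft_model
#
#     lora_config = LoraConfig(
#         r=16,                                  # rank of the low-rank update matrices
#         lora_alpha=32,                         # scaling factor applied to the update
#         target_modules=["q_proj", "v_proj"],   # example Linear layers that receive adapters
#         lora_dropout=0.05,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )
#     model = get_peft_model(model, lora_config)
#     model.print_trainable_parameters()         # only the adapter weights remain trainable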
22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | import random 28 | from dataclasses import dataclass, field 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List,Union 32 | 33 | import datasets 34 | import evaluate 35 | import torch 36 | from datasets import load_dataset 37 | from peft import ( # noqa: E402 38 | LoraConfig, 39 | PeftModel, 40 | get_peft_model, 41 | get_peft_model_state_dict, 42 | prepare_model_for_int8_training, 43 | prepare_model_for_kbit_training, 44 | set_peft_model_state_dict, 45 | ) 46 | import transformers 47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 48 | from transformers import ( 49 | CONFIG_MAPPING, 50 | MODEL_FOR_CAUSAL_LM_MAPPING, 51 | AutoConfig, 52 | AutoModelForCausalLM, 53 | AutoTokenizer, 54 | TrainerCallback, 55 | TrainerState, 56 | TrainerControl, 57 | HfArgumentParser, 58 | Trainer, 59 | TrainingArguments, 60 | default_data_collator, 61 | BitsAndBytesConfig, 62 | is_torch_tpu_available, 63 | set_seed, 64 | ) 65 | from transformers.testing_utils import CaptureLogger 66 | from transformers.trainer_utils import get_last_checkpoint 67 | from transformers.utils import check_min_version, send_example_telemetry 68 | from transformers.utils.versions import require_version 69 | 70 | import pdb 71 | 72 | 73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 74 | # check_min_version("4.27.0.dev0") 75 | 76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 77 | 78 | logger = logging.getLogger(__name__) 79 | 80 | 81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 83 | 84 | 85 | @dataclass 86 | class ModelArguments: 87 | """ 88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 89 | """ 90 | 91 | model_name_or_path: Optional[str] = field( 92 | default=None, 93 | metadata={ 94 | "help": ( 95 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 96 | ) 97 | }, 98 | ) 99 | model_type: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 102 | ) 103 | config_overrides: Optional[str] = field( 104 | default=None, 105 | metadata={ 106 | "help": ( 107 | "Override some existing default config settings when a model is trained from scratch. Example: " 108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 109 | ) 110 | }, 111 | ) 112 | config_name: Optional[str] = field( 113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 114 | ) 115 | tokenizer_name: Optional[str] = field( 116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 117 | ) 118 | cache_dir: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 121 | ) 122 | lora_r: Optional[int] = field(default=16) 123 | lora_alpha: Optional[int] = field(default=32) 124 | target_modules: Optional[str] = field( 125 | default='q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj', 126 | metadata={ 127 | "help": "List of module names or regex expression of the module names to replace with Lora." 
128 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 129 | }, 130 | ) 131 | use_fast_tokenizer: bool = field( 132 | default=True, 133 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 134 | ) 135 | load_in_bits: Optional[int] = field(default=8) 136 | model_revision: str = field( 137 | default="main", 138 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 139 | ) 140 | use_auth_token: bool = field( 141 | default=False, 142 | metadata={ 143 | "help": ( 144 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 145 | "with private models)." 146 | ) 147 | }, 148 | ) 149 | 150 | torch_dtype: Optional[str] = field( 151 | default=None, 152 | metadata={ 153 | "help": ( 154 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 155 | "dtype will be automatically derived from the model's weights." 156 | ), 157 | "choices": ["auto", "bfloat16", "float16", "float32"], 158 | }, 159 | ) 160 | 161 | def __post_init__(self): 162 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 163 | raise ValueError( 164 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 165 | ) 166 | if type(self.target_modules)==str: 167 | self.target_modules = self.target_modules.split(',') 168 | 169 | 170 | @dataclass 171 | class DataTrainingArguments: 172 | """ 173 | Arguments pertaining to what data we are going to input our model for training and eval. 174 | """ 175 | train_on_inputs: bool = field( 176 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 177 | ) 178 | dataset_name: Optional[str] = field( 179 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 180 | ) 181 | dataset_config_name: Optional[str] = field( 182 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 183 | ) 184 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 185 | validation_files: Optional[List[str]] = field( 186 | default=None, 187 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 188 | ) 189 | max_train_samples: Optional[int] = field( 190 | default=None, 191 | metadata={ 192 | "help": ( 193 | "For debugging purposes or quicker training, truncate the number of training examples to this " 194 | "value if set." 195 | ) 196 | }, 197 | ) 198 | max_eval_samples: Optional[int] = field( 199 | default=None, 200 | metadata={ 201 | "help": ( 202 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 203 | "value if set." 204 | ) 205 | }, 206 | ) 207 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 208 | block_size: Optional[int] = field( 209 | default=None, 210 | metadata={ 211 | "help": ( 212 | "Optional input sequence length after tokenization. " 213 | "The training dataset will be truncated in block of this size for training. " 214 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
215 | ) 216 | }, 217 | ) 218 | overwrite_cache: bool = field( 219 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 220 | ) 221 | validation_split_percentage: Optional[int] = field( 222 | default=5, 223 | metadata={ 224 | "help": "The percentage of the train set used as validation set in case there's no validation split" 225 | }, 226 | ) 227 | preprocessing_num_workers: Optional[int] = field( 228 | default=None, 229 | metadata={"help": "The number of processes to use for the preprocessing."}, 230 | ) 231 | keep_linebreaks: bool = field( 232 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 233 | ) 234 | 235 | def __post_init__(self): 236 | if self.streaming: 237 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 238 | 239 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 240 | raise ValueError("Need either a dataset name or a training/validation file.") 241 | else: 242 | if self.train_files is not None: 243 | extension = self.train_files[0].split(".")[-1] 244 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 245 | if self.validation_files is not None: 246 | extension = self.validation_files[0].split(".")[-1] 247 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 248 | 249 | class SavePeftModelCallback(TrainerCallback): 250 | def on_save( 251 | self, 252 | args: TrainingArguments, 253 | state: TrainerState, 254 | control: TrainerControl, 255 | **kwargs, 256 | ): 257 | if state.is_world_process_zero: 258 | print('+++++++++++++++++save call back++++++++++++++++') 259 | checkpoint_folder = os.path.join( 260 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 261 | ) 262 | kwargs["model"].save_pretrained(checkpoint_folder) 263 | 264 | pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin") 265 | if os.path.exists(pytorch_model_path): 266 | os.remove(pytorch_model_path) 267 | return control 268 | 269 | def main(): 270 | # See all possible arguments in src/transformers/training_args.py 271 | # or by passing the --help flag to this script. 272 | # We now keep distinct sets of args, for a cleaner separation of concerns. 273 | 274 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 275 | # pdb.set_trace() 276 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 277 | # If we pass only one argument to the script and it's the path to a json file, 278 | # let's parse it to get our arguments. 279 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 280 | else: 281 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 282 | 283 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 284 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 285 | send_example_telemetry("run_clm", model_args, data_args) 286 | 287 | # Setup logging 288 | logging.basicConfig( 289 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 290 | datefmt="%m/%d/%Y %H:%M:%S", 291 | handlers=[logging.StreamHandler(sys.stdout)], 292 | ) 293 | 294 | if training_args.should_log: 295 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
296 | transformers.utils.logging.set_verbosity_info() 297 | 298 | log_level = training_args.get_process_log_level() 299 | logger.setLevel(log_level) 300 | datasets.utils.logging.set_verbosity(log_level) 301 | transformers.utils.logging.set_verbosity(log_level) 302 | transformers.utils.logging.enable_default_handler() 303 | transformers.utils.logging.enable_explicit_format() 304 | 305 | # Log on each process the small summary: 306 | logger.warning( 307 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " 308 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 309 | ) 310 | logger.info(f"Training/evaluation parameters {training_args}") 311 | 312 | # Detecting last checkpoint. 313 | last_checkpoint = None 314 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 315 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 316 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 317 | raise ValueError( 318 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 319 | "Use --overwrite_output_dir to overcome." 320 | ) 321 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 322 | logger.info( 323 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 324 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 325 | ) 326 | 327 | # Set seed before initializing model. 328 | set_seed(training_args.seed) 329 | 330 | # Get the datasets: provide your own CSV/JSON/TXT training and evaluation files via --train_files and 331 | # --validation_files (see below); the --dataset_name hub option declared above is not consumed here, so 332 | # the data is always read from local files. 333 | # 334 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 335 | # 'text' is found. You can easily tweak this behavior (see below). 336 | # 337 | # In distributed training, the load_dataset function guarantees that only one local process can concurrently 338 | # download the dataset. 339 | if True: 340 | data_files = {} 341 | dataset_args = {} 342 | if data_args.train_files is not None: 343 | data_files["train"] = data_args.train_files 344 | if data_args.validation_files is not None: 345 | data_files["validation"] = data_args.validation_files 346 | extension = ( 347 | data_args.train_files[0].split(".")[-1] 348 | if data_args.train_files is not None 349 | else data_args.validation_files[0].split(".")[-1] 350 | ) 351 | if extension == "txt": 352 | extension = "text" 353 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 354 | raw_datasets = load_dataset( 355 | extension, 356 | data_files=data_files, 357 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 358 | use_auth_token=True if model_args.use_auth_token else None, 359 | **dataset_args, 360 | ) 361 | # If no validation data is there, validation_split_percentage will be used to divide the dataset.
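# NOTE: when no --validation_files are supplied, the split below carves the validation set out of the
# training data: with the default --validation_split_percentage 5, train[:5%] becomes the validation
# split and train[5%:] is kept for training.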
362 | if "validation" not in raw_datasets.keys(): 363 | raw_datasets["validation"] = load_dataset( 364 | extension, 365 | data_files=data_files, 366 | split=f"train[:{data_args.validation_split_percentage}%]", 367 | cache_dir=model_args.cache_dir, 368 | use_auth_token=True if model_args.use_auth_token else None, 369 | **dataset_args, 370 | ) 371 | raw_datasets["train"] = load_dataset( 372 | extension, 373 | data_files=data_files, 374 | split=f"train[{data_args.validation_split_percentage}%:]", 375 | cache_dir=model_args.cache_dir, 376 | use_auth_token=True if model_args.use_auth_token else None, 377 | **dataset_args, 378 | ) 379 | 380 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 381 | # https://huggingface.co/docs/datasets/loading_datasets.html. 382 | 383 | # Load pretrained model and tokenizer 384 | # 385 | # Distributed training: 386 | # The .from_pretrained methods guarantee that only one local process can concurrently 387 | # download model & vocab. 388 | 389 | config_kwargs = { 390 | "cache_dir": model_args.cache_dir, 391 | "revision": model_args.model_revision, 392 | "use_auth_token": True if model_args.use_auth_token else None, 393 | } 394 | if model_args.config_name: 395 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 396 | elif model_args.model_name_or_path: 397 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 398 | else: 399 | config = CONFIG_MAPPING[model_args.model_type]() 400 | logger.warning("You are instantiating a new config instance from scratch.") 401 | if model_args.config_overrides is not None: 402 | logger.info(f"Overriding config: {model_args.config_overrides}") 403 | config.update_from_string(model_args.config_overrides) 404 | logger.info(f"New config: {config}") 405 | 406 | tokenizer_kwargs = { 407 | "cache_dir": model_args.cache_dir, 408 | "use_fast": model_args.use_fast_tokenizer, 409 | "revision": model_args.model_revision, 410 | "use_auth_token": True if model_args.use_auth_token else None, 411 | "padding_side":'left' 412 | } 413 | if model_args.tokenizer_name: 414 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 415 | elif model_args.model_name_or_path: 416 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 417 | else: 418 | raise ValueError( 419 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 420 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
421 | ) 422 | tokenizer.pad_token = tokenizer.eos_token 423 | lora_config = LoraConfig( 424 | r=model_args.lora_r, 425 | lora_alpha=model_args.lora_alpha, 426 | # target_modules=["query_key_value"], 427 | # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'], 428 | target_modules = model_args.target_modules, 429 | fan_in_fan_out = False, 430 | lora_dropout=0.05, 431 | inference_mode=False, 432 | bias="none", 433 | task_type="CAUSAL_LM", 434 | ) 435 | print(lora_config) 436 | bnb_config = BitsAndBytesConfig( 437 | load_in_4bit=True, 438 | bnb_4bit_use_double_quant=True, 439 | bnb_4bit_quant_type="nf4", 440 | bnb_4bit_compute_dtype=torch.bfloat16 441 | ) 442 | if model_args.model_name_or_path: 443 | torch_dtype = ( 444 | model_args.torch_dtype 445 | if model_args.torch_dtype in ["auto", None] 446 | else getattr(torch, model_args.torch_dtype) 447 | ) 448 | print(torch_dtype) 449 | torch_dtype = torch_dtype if torch_dtype is not None else torch.float16  # honor an explicit --torch_dtype; fall back to fp16 otherwise 450 | model = AutoModelForCausalLM.from_pretrained( 451 | model_args.model_name_or_path, 452 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 453 | config=config, 454 | cache_dir=model_args.cache_dir, 455 | revision=model_args.model_revision, 456 | use_auth_token=True if model_args.use_auth_token else None, 457 | torch_dtype=torch_dtype, 458 | load_in_8bit=True if model_args.load_in_bits==8 else False, 459 | trust_remote_code=True, 460 | use_flash_attention_2=True, 461 | quantization_config=bnb_config if model_args.load_in_bits==4 else None, 462 | # device_map = 'auto' 463 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)} 464 | ) 465 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[]) 466 | 467 | else: 468 | model = AutoModelForCausalLM.from_config(config) 469 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 470 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 471 | 472 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 473 | # on a small vocab and want a smaller embedding size, remove this test. 474 | embedding_size = model.get_input_embeddings().weight.shape[0] 475 | if len(tokenizer) > embedding_size: 476 | model.resize_token_embeddings(len(tokenizer)) 477 | if model_args.load_in_bits==8: 478 | model = prepare_model_for_int8_training(model) 479 | elif model_args.load_in_bits==4: 480 | model = prepare_model_for_kbit_training(model) 481 | 482 | # Preprocessing the datasets. 483 | # First we tokenize all the texts.
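# NOTE: two preprocessing paths are selected below based on how many columns the input files have:
#   * one column ("text"): tokenize_function is used and the labels are a plain copy of input_ids, so the
#     loss is computed on every token;
#   * two columns ("input"/"target"): generate_and_tokenize_prompt concatenates prompt and response and
#     masks the prompt tokens with -100, so the loss is only computed on the response tokens.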
484 | if training_args.do_train: 485 | column_names = list(raw_datasets["train"].features) 486 | else: 487 | column_names = list(raw_datasets["validation"].features) 488 | 489 | train_on_inputs = True  # set from the column layout below; the --train_on_inputs argument is not consulted here 490 | if len(column_names)==1: 491 | text_column_name = "text" if "text" in column_names else column_names[0] 492 | elif len(column_names)==2: 493 | input_column_name = 'input' if 'input' in column_names else column_names[0] 494 | target_column_name = 'target' if 'target' in column_names else column_names[1] 495 | train_on_inputs=False 496 | else: 497 | raise ValueError('Unexpected number of columns in the input file') 498 | print('train_on_inputs',train_on_inputs) 499 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 500 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 501 | 502 | def tokenize_function(examples): 503 | with CaptureLogger(tok_logger) as cl: 504 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 505 | output['labels'] = output['input_ids'].copy() 506 | return output 507 | 508 | def tokenize(prompt): 509 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 510 | result["labels"] = result["input_ids"].copy() 511 | return result 512 | 513 | def generate_and_tokenize_prompt(data_point): 514 | input_text = data_point[input_column_name] 515 | target_text = data_point[target_column_name] 516 | full_prompt = input_text+target_text 517 | tokenized_full_prompt = tokenize(full_prompt) 518 | if not train_on_inputs: 519 | user_prompt = input_text 520 | tokenized_user_prompt = tokenize(user_prompt) 521 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 522 | tokenized_full_prompt["labels"] = [ 523 | -100 524 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 525 | user_prompt_len: 526 | ] 527 | return tokenized_full_prompt 528 | 529 | 530 | 531 | with training_args.main_process_first(desc="dataset map tokenization"): 532 | if not data_args.streaming: 533 | tokenized_datasets = raw_datasets.map( 534 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 535 | batched=True if train_on_inputs==True else False, 536 | num_proc=data_args.preprocessing_num_workers, 537 | remove_columns=column_names, 538 | load_from_cache_file=not data_args.overwrite_cache, 539 | desc="Running tokenizer on dataset", 540 | ) 541 | else: 542 | tokenized_datasets = raw_datasets.map( 543 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 544 | batched=True if train_on_inputs==True else False, 545 | remove_columns=column_names, 546 | ) 547 | 548 | if data_args.block_size is None: 549 | block_size = tokenizer.model_max_length 550 | if block_size > 2048: 551 | block_size = 2048 552 | else: 553 | block_size = min(data_args.block_size, tokenizer.model_max_length) 554 | 555 | if training_args.do_train: 556 | if "train" not in tokenized_datasets: 557 | raise ValueError("--do_train requires a train dataset") 558 | train_dataset = tokenized_datasets["train"] 559 | if data_args.max_train_samples is not None: 560 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 561 | train_dataset = train_dataset.select(range(max_train_samples)) 562 | for index in random.sample(range(len(train_dataset)), 3): 563 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 564 | train_dataset = 
train_dataset.shuffle(seed=training_args.seed) 565 | 566 | if training_args.do_eval: 567 | if "validation" not in tokenized_datasets: 568 | raise ValueError("--do_eval requires a validation dataset") 569 | eval_dataset = tokenized_datasets["validation"] 570 | if data_args.max_eval_samples is not None: 571 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 572 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 573 | 574 | def preprocess_logits_for_metrics(logits, labels): 575 | if isinstance(logits, tuple): 576 | # Depending on the model and config, logits may contain extra tensors, 577 | # like past_key_values, but logits always come first 578 | logits = logits[0] 579 | return logits.argmax(dim=-1) 580 | 581 | metric = evaluate.load("accuracy.py") 582 | 583 | def compute_metrics(eval_preds): 584 | preds, labels = eval_preds 585 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 586 | # by preprocess_logits_for_metrics but we need to shift the labels 587 | labels = labels[:, 1:].reshape(-1) 588 | # .reshape(-1) 589 | preds = preds[:, :-1].reshape(-1) 590 | # .reshape(-1) 591 | # print(labels.shape) 592 | # true_predictions = [ 593 | # [p for (p, l) in zip(pred, gold_label) if l != -100] 594 | # for pred, gold_label in zip(preds, labels) 595 | # ] 596 | # true_labels = [ 597 | # [l for (p, l) in zip(pred, gold_label) if l != -100] 598 | # for pred, gold_label in zip(preds, labels) 599 | # ] 600 | # preds = np.array(true_predictions).reshape(-1) 601 | # labels = np.array(true_labels).reshape(-1) 602 | return metric.compute(predictions=preds, references=labels) 603 | # layer_norm_names=[] 604 | 605 | 606 | 607 | model = get_peft_model(model, lora_config) 608 | model.print_trainable_parameters() 609 | 610 | # Initialize our Trainer 611 | trainer = Trainer( 612 | model=model, 613 | args=training_args, 614 | train_dataset=train_dataset if training_args.do_train else None, 615 | eval_dataset=eval_dataset if training_args.do_eval else None, 616 | tokenizer=tokenizer, 617 | # Data collator will default to DataCollatorWithPadding, so we change it. 618 | data_collator=transformers.DataCollatorForSeq2Seq( 619 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 620 | ), 621 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 622 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 623 | callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None), 624 | ) 625 | 626 | # Training 627 | if training_args.do_train: 628 | checkpoint = None 629 | '''if training_args.resume_from_checkpoint is not None: 630 | resume_from_checkpoint = training_args.resume_from_checkpoint 631 | checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin") 632 | if not os.path.exists(checkpoint_name): 633 | checkpoint_name = os.path.join( 634 | resume_from_checkpoint, "adapter_model.bin" 635 | ) # only LoRA model - LoRA config above has to fit 636 | resume_from_checkpoint = ( 637 | False # So the trainer won't try loading its state 638 | ) 639 | # The two files above have a different name depending on how they were saved, but are actually the same. 
640 | if os.path.exists(checkpoint_name): 641 | print(f"Restarting from {checkpoint_name}") 642 | adapters_weights = torch.load(checkpoint_name) 643 | set_peft_model_state_dict(model, adapters_weights) 644 | else: 645 | print(f"Checkpoint {checkpoint_name} not found") 646 | # checkpoint = Fa''' 647 | if training_args.resume_from_checkpoint is not None: 648 | checkpoint = training_args.resume_from_checkpoint 649 | elif last_checkpoint is not None: 650 | checkpoint = last_checkpoint 651 | 652 | if torch.__version__ >= "2" and sys.platform != "win32": 653 | model = torch.compile(model) 654 | 655 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 656 | trainer.save_model() # Saves the tokenizer too for easy upload 657 | 658 | metrics = train_result.metrics 659 | 660 | max_train_samples = ( 661 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 662 | ) 663 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 664 | 665 | trainer.log_metrics("train", metrics) 666 | trainer.save_metrics("train", metrics) 667 | trainer.save_state() 668 | 669 | # Evaluation 670 | if training_args.do_eval: 671 | logger.info("*** Evaluate ***") 672 | 673 | metrics = trainer.evaluate() 674 | 675 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 676 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 677 | try: 678 | perplexity = math.exp(metrics["eval_loss"]) 679 | except OverflowError: 680 | perplexity = float("inf") 681 | metrics["perplexity"] = perplexity 682 | 683 | trainer.log_metrics("eval", metrics) 684 | trainer.save_metrics("eval", metrics) 685 | 686 | 687 | 688 | def _mp_fn(index): 689 | # For xla_spawn (TPUs) 690 | main() 691 | 692 | 693 | if __name__ == "__main__": 694 | main() 695 | -------------------------------------------------------------------------------- /train/sft/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | output_model=save_folder 2 | # Change this to your own output directory 3 | if [ !
-d ${output_model} ];then 4 | mkdir ${output_model} 5 | fi 6 | export CUDA_HOME=/usr/local/cuda/ 7 | export NCCL_P2P_DISABLE=1 8 | cp ./finetune_lora.sh ${output_model}  # keep a copy of the launch script alongside the outputs for reproducibility 9 | deepspeed --include localhost:1,0 finetune_clm_lora.py \ 10 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ 11 | --train_files ../../data/train_sft.csv \ 12 | --validation_files ../../data/dev_sft.csv \ 13 | ../../data/dev_sft_sharegpt.csv \ 14 | --per_device_train_batch_size 1 \ 15 | --per_device_eval_batch_size 1 \ 16 | --do_train \ 17 | --do_eval \ 18 | --use_fast_tokenizer false \ 19 | --output_dir ${output_model} \ 20 | --evaluation_strategy steps \ 21 | --max_eval_samples 800 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 8 \ 24 | --num_train_epochs 10 \ 25 | --warmup_steps 400 \ 26 | --load_in_bits 4 \ 27 | --lora_r 8 \ 28 | --lora_alpha 32 \ 29 | --target_modules q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj \ 30 | --logging_dir ${output_model}/logs \ 31 | --logging_strategy steps \ 32 | --logging_steps 10 \ 33 | --save_strategy steps \ 34 | --preprocessing_num_workers 10 \ 35 | --save_steps 20 \ 36 | --eval_steps 20 \ 37 | --save_total_limit 2000 \ 38 | --seed 42 \ 39 | --disable_tqdm false \ 40 | --ddp_find_unused_parameters false \ 41 | --block_size 2048 \ 42 | --report_to tensorboard \ 43 | --overwrite_output_dir \ 44 | --deepspeed ds_config_zero2.json \ 45 | --ignore_data_skip true \ 46 | --bf16 \ 47 | --gradient_checkpointing \ 48 | --bf16_full_eval \ 49 | --ddp_timeout 18000000 \ 50 | | tee -a ${output_model}/train.log 51 | 52 | 53 | 54 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 55 | --------------------------------------------------------------------------------
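Once training finishes, the checkpoints written by SavePeftModelCallback contain only the LoRA adapter weights, not the full model. The sketch below shows one way to load such an adapter on top of the base model for a quick inference check; the paths, the prompt, and the generation settings are illustrative assumptions rather than part of the repository (alternatively, merge_peft_adapter.py under train/merge_peft_model can be used to merge the adapter into the base weights).

# inference_check.py -- minimal sketch, assuming the paths below point at your base model and at a LoRA
# checkpoint produced by finetune_clm_lora.py; adjust them to your environment.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "meta-llama/Llama-2-7b-chat-hf"   # same value as --model_name_or_path in finetune_lora.sh
adapter_dir = "save_folder/checkpoint-20"      # hypothetical checkpoint folder written during training

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_dir)  # attach the trained LoRA adapter
model.eval()

prompt = "Hello, please introduce yourself."   # illustrative prompt; use whatever chat format your data followed
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=0.3)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))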