├── README.md ├── README_EN.md ├── assets ├── Llama4-Maverick.png ├── base_eval.png ├── ceval.jpg ├── llama.jpg ├── llama.png ├── llama2-chinese-webui.jpg ├── llama3_eval.png ├── llama_eval.jpeg ├── meta_eval_13B.md ├── meta_eval_7B.md ├── tuned_eval.png ├── wechat-new.jpeg └── wechat.jpeg ├── data ├── dev_sft.csv ├── dev_sft_sharegpt.csv └── train_sft.csv ├── docker ├── Dockerfile ├── Dockerfile_train └── docker-compose.yml ├── docs ├── chat_gradio_guide.md └── inference_speed_guide.md ├── examples ├── chat_gradio.py ├── chat_gradio_no_merge.py └── llama2_for_langchain.py ├── inference-speed ├── CPU │ └── ggml │ │ └── README.md └── GPU │ ├── FasterTransformer_example │ └── README.md │ ├── JittorLLMs_example │ └── README.md │ ├── TensorRT-LLM_example │ ├── README.md │ ├── atom_inference.py │ └── utils.py │ ├── lmdeploy_example │ ├── README.md │ └── test_api_server.py │ └── vllm_example │ ├── README.md │ ├── api_server.py │ ├── client_test.py │ ├── multi_gpus_api_server.sh │ └── single_gpu_api_server.sh ├── requirements.txt ├── scripts ├── api │ ├── README.md │ ├── accelerate_client.py │ └── accelerate_server.py ├── convert2hf │ ├── README.md │ └── convert_llama_weights_to_hf.py └── test_model │ └── test_pretrain_model.ipynb └── train ├── merge_peft_model ├── merge.sh ├── merge_muilt.sh ├── merge_muilt_peft_adapter.py └── merge_peft_adapter.py ├── pretrain ├── accuracy.py ├── ds_config_zero2.json ├── ds_config_zero3.json ├── pretrain.sh └── pretrain_clm.py └── sft ├── accuracy.py ├── ds_config_zero2.json ├── finetune.sh ├── finetune_clm.py ├── finetune_clm_lora.py └── finetune_lora.sh /assets/Llama4-Maverick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/Llama4-Maverick.png -------------------------------------------------------------------------------- /assets/base_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/base_eval.png -------------------------------------------------------------------------------- /assets/ceval.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/ceval.jpg -------------------------------------------------------------------------------- /assets/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.jpg -------------------------------------------------------------------------------- /assets/llama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama.png -------------------------------------------------------------------------------- /assets/llama2-chinese-webui.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama2-chinese-webui.jpg -------------------------------------------------------------------------------- /assets/llama3_eval.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama3_eval.png -------------------------------------------------------------------------------- /assets/llama_eval.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/llama_eval.jpeg -------------------------------------------------------------------------------- /assets/tuned_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/tuned_eval.png -------------------------------------------------------------------------------- /assets/wechat-new.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat-new.jpeg -------------------------------------------------------------------------------- /assets/wechat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaFamily/Llama-Chinese/6fa0fffb0dd82fe3cfaa1449ee54a5806d26ae9b/assets/wechat.jpeg -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # 使用pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel作为基础镜像 2 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel 3 | 4 | RUN apt-get update -y --allow-unauthenticated 5 | RUN apt install -y git vim git-lfs 6 | 7 | #设置工作目录 8 | WORKDIR /root/Llama-Chinese 9 | 10 | # 从git上克隆llama-chinese仓库 11 | RUN git clone https://github.com/LlamaFamily/Llama-Chinese.git /root/Llama-Chinese 12 | 13 | # tsinghua source 14 | RUN mkdir -p ~/.pip 15 | RUN echo "[global]\nindex-url = https://pypi.tuna.tsinghua.edu.cn/simple" > ~/.pip/pip.conf 16 | 17 | # 使用pip安装requirements.txt 18 | RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -r requirements.txt 19 | 20 | #克隆Hugging Face仓库 21 | RUN git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat 22 | 23 | #开启7860端口 24 | EXPOSE 7860 25 | 26 | #设置启动命令 27 | ENTRYPOINT ["python", "examples/chat_gradio.py", "--model_name_or_path", "/root/Llama-Chinese/Atom-7B-Chat/"] 28 | -------------------------------------------------------------------------------- /docker/Dockerfile_train: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as builder 2 | RUN apt-get update -y --allow-unauthenticated 3 | RUN apt install git tmux htop vim -y 4 | RUN pip install bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 5 | RUN pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 6 | RUN pip install peft -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 7 | RUN pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 8 | RUN pip install deepspeed -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn 9 | RUN pip install scipy sentencepiece datasets joblib 
sentence_transformers cn2an evaluate tensorboard wandb -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | app: 4 | image: flagalpha/llama2-chinese:gradio # 这里替换为你实际的镜像名 5 | volumes: 6 | - /usr/local/nvidia:/usr/local/nvidia # 让容器访问主机的NVIDIA驱动 7 | environment: 8 | - NVIDIA_VISIBLE_DEVICES=all # 让容器可以访问所有的NVIDIA GPU 9 | ports: 10 | - 7860:7860 # 在容器和主机之间映射端口 11 | deploy: 12 | resources: 13 | reservations: 14 | devices: 15 | - driver: nvidia 16 | capabilities: [gpu] # 使用Docker的设备请求来让容器使用GPU 17 | -------------------------------------------------------------------------------- /docs/chat_gradio_guide.md: -------------------------------------------------------------------------------- 1 | # Docker环境执行chat_gradio.py 2 | 3 | 系统需要准备的环境 4 | 5 | + docker: 24.0.2 6 | + docker-compose 7 | 8 | ## 第一步. 准备Docker镜像 9 | 10 | 通过docker镜像可以更方便的管理需要安装的环境依赖。所以这里可以直接通过docker容器启动[chat_gradio](../examples/chat_gradio.py), 第一步准备镜像环境。 11 | 12 | ```bash 13 | git clone https://github.com/LlamaFamily/Llama-Chinese.git 14 | 15 | cd Llama-Chinese 16 | 17 | docker build -f docker/Dockerfile -t FlagAlpha/llama2-chinese:gradio . 18 | ``` 19 | 20 | ## 第二步. 通过docker-compose启动chat_gradio 21 | 22 | 23 | ```bash 24 | cd Llama-Chinese/docker 25 | doker-compose up -d --build 26 | ``` -------------------------------------------------------------------------------- /docs/inference_speed_guide.md: -------------------------------------------------------------------------------- 1 | # 推理部署 2 | 3 | > 训练完之后或者经过微调之后的模型或者直接从[huggingface](https://huggingface.co/FlagAlpha)下载的模型,都需要部署使用。部署也就是指的模型推理,如果直接使用原生的trainsfomers进行部署,速度会比较慢。针对推理有多种加速手段,会带来较快的推理速度。 4 | 5 | 6 | 7 | ## 1. GPU推理方案 8 | 9 | ### 方案一:vllm 10 | 11 | [使用说明](../inference-speed/GPU/vllm_example/README.md) 12 | 13 | ### 方案二:TensorRT-LLM 14 | 15 | [使用说明](../inference-speed/GPU/TensorRT-LLM_example/README.md) 16 | 17 | 18 | ## 2. CPU 推理方案 19 | 20 | ### 方案一:ggml 21 | [使用说明](../inference-speed/CPU/ggml/README.md) 22 | -------------------------------------------------------------------------------- /examples/chat_gradio.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import time 3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer 4 | from threading import Thread 5 | import torch,sys,os 6 | import json 7 | import pandas 8 | import argparse 9 | 10 | with gr.Blocks() as demo: 11 | gr.Markdown("""

<h1><center>智能助手</center></h1>
""") 12 | chatbot = gr.Chatbot() 13 | msg = gr.Textbox() 14 | state = gr.State() 15 | with gr.Row(): 16 | clear = gr.Button("新话题") 17 | re_generate = gr.Button("重新回答") 18 | sent_bt = gr.Button("发送") 19 | with gr.Accordion("生成参数", open=False): 20 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3) 21 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95) 22 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0) 23 | def user(user_message, history): 24 | return "", history + [[user_message, None]] 25 | def bot(history,temperature,top_p,slider_context_times): 26 | if pandas.isnull(history[-1][1])==False: 27 | history[-1][1] = None 28 | yield history 29 | slider_context_times = int(slider_context_times) 30 | history_true = history[1:-1] 31 | prompt = '' 32 | if slider_context_times>0: 33 | prompt += '\n'.join([("Human: "+one_chat[0].replace('
<br>','\n')+'\n</s>' if one_chat[0] else '') +"<s>Assistant: "+one_chat[1].replace('<br>','\n')+'\n</s>' for one_chat in history_true[-slider_context_times:] ]) 34 | prompt += "<s>Human: "+history[-1][0].replace('<br>','\n')+"\n</s><s>
Assistant:" 35 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda') 36 | generate_input = { 37 | "input_ids":input_ids, 38 | "max_new_tokens":512, 39 | "do_sample":True, 40 | "top_k":50, 41 | "top_p":top_p, 42 | "temperature":temperature, 43 | "repetition_penalty":1.3, 44 | "streamer":streamer, 45 | "eos_token_id":tokenizer.eos_token_id, 46 | "bos_token_id":tokenizer.bos_token_id, 47 | "pad_token_id":tokenizer.pad_token_id 48 | } 49 | thread = Thread(target=model.generate, kwargs=generate_input) 50 | thread.start() 51 | start_time = time.time() 52 | bot_message ='' 53 | print('Human:',history[-1][0]) 54 | print('Assistant: ',end='',flush=True) 55 | for new_text in streamer: 56 | print(new_text,end='',flush=True) 57 | if len(new_text)==0: 58 | continue 59 | if new_text!='': 60 | bot_message+=new_text 61 | if 'Human:' in bot_message: 62 | bot_message = bot_message.split('Human:')[0] 63 | history[-1][1] = bot_message 64 | yield history 65 | end_time =time.time() 66 | print() 67 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message)) 68 | 69 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( 70 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 71 | ) 72 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then( 73 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 74 | ) 75 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot ) 76 | clear.click(lambda: [], None, chatbot, queue=False) 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path') 81 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model') 82 | args = parser.parse_args() 83 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False) 84 | tokenizer.pad_token = tokenizer.eos_token 85 | if args.is_4bit==False: 86 | model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, 87 | device_map='cuda:0' if torch.cuda.is_available() else "auto", 88 | torch_dtype=torch.float16, 89 | load_in_8bit=True, 90 | trust_remote_code=True, 91 | use_flash_attention_2=True) 92 | model.eval() 93 | else: 94 | from auto_gptq import AutoGPTQForCausalLM 95 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 96 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True) 97 | if torch.__version__ >= "2" and sys.platform != "win32": 98 | model = torch.compile(model) 99 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0") 100 | -------------------------------------------------------------------------------- /examples/chat_gradio_no_merge.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import time 3 | from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer 4 | from threading import Thread 5 | from peft import PeftModel,PeftConfig 6 | import torch,sys,os 7 | import json 8 | import pandas 9 | import argparse 10 | 11 | with gr.Blocks() as demo: 12 | gr.Markdown("""

<h1><center>智能助手</center></h1>
""") 13 | chatbot = gr.Chatbot() 14 | msg = gr.Textbox() 15 | state = gr.State() 16 | with gr.Row(): 17 | clear = gr.Button("新话题") 18 | re_generate = gr.Button("重新回答") 19 | sent_bt = gr.Button("发送") 20 | with gr.Accordion("生成参数", open=False): 21 | slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3) 22 | slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95) 23 | slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0,step=2.0) 24 | def user(user_message, history): 25 | return "", history + [[user_message, None]] 26 | def bot(history,temperature,top_p,slider_context_times): 27 | if pandas.isnull(history[-1][1])==False: 28 | history[-1][1] = None 29 | yield history 30 | slider_context_times = int(slider_context_times) 31 | history_true = history[1:-1] 32 | prompt = '' 33 | if slider_context_times>0: 34 | prompt += '\n'.join([("Human: "+one_chat[0].replace('
<br>','\n')+'\n</s>' if one_chat[0] else '') +"<s>Assistant: "+one_chat[1].replace('<br>','\n')+'\n</s>' for one_chat in history_true[-slider_context_times:] ]) 35 | prompt += "<s>Human: "+history[-1][0].replace('<br>','\n')+"\n</s><s>
Assistant:" 36 | input_ids = tokenizer([prompt], return_tensors="pt",add_special_tokens=False).input_ids[:,-512:].to('cuda') 37 | generate_input = { 38 | "input_ids":input_ids, 39 | "max_new_tokens":512, 40 | "do_sample":True, 41 | "top_k":50, 42 | "top_p":top_p, 43 | "temperature":temperature, 44 | "repetition_penalty":1.3, 45 | "streamer":streamer, 46 | "eos_token_id":tokenizer.eos_token_id, 47 | "bos_token_id":tokenizer.bos_token_id, 48 | "pad_token_id":tokenizer.pad_token_id 49 | } 50 | thread = Thread(target=model.generate, kwargs=generate_input) 51 | thread.start() 52 | start_time = time.time() 53 | bot_message ='' 54 | print('Human:',history[-1][0]) 55 | print('Assistant: ',end='',flush=True) 56 | for new_text in streamer: 57 | print(new_text,end='',flush=True) 58 | if len(new_text)==0: 59 | continue 60 | if new_text!='': 61 | bot_message+=new_text 62 | if 'Human:' in bot_message: 63 | bot_message = bot_message.split('Human:')[0] 64 | history[-1][1] = bot_message 65 | yield history 66 | end_time =time.time() 67 | print() 68 | print('生成耗时:',end_time-start_time,'文字长度:',len(bot_message),'字耗时:',(end_time-start_time)/len(bot_message)) 69 | 70 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( 71 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 72 | ) 73 | sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then( 74 | bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot 75 | ) 76 | re_generate.click( bot, [chatbot,slider_temp,slider_top_p,slider_context_times], chatbot ) 77 | clear.click(lambda: [], None, chatbot, queue=False) 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument("--model_name_or_path", type=str, help='mode name or path') 82 | parser.add_argument("--is_4bit", action='store_true', help='use 4bit model') 83 | args = parser.parse_args() 84 | # tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,use_fast=False) 85 | # tokenizer.pad_token = tokenizer.eos_token 86 | if args.is_4bit==False: 87 | config = PeftConfig.from_pretrained(args.model_name_or_path) 88 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,use_fast=False) 89 | tokenizer.pad_token = tokenizer.eos_token 90 | model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, 91 | device_map='cuda:0' if torch.cuda.is_available() else "auto", 92 | torch_dtype=torch.float16, 93 | load_in_8bit=True, 94 | low_cpu_mem_usage=True, 95 | trust_remote_code=True, 96 | use_flash_attention_2=True) 97 | model = PeftModel.from_pretrained(model, args.model_name_or_path, device_map={"": 0}) 98 | model.eval() 99 | else: 100 | from auto_gptq import AutoGPTQForCausalLM 101 | model = AutoGPTQForCausalLM.from_quantized(args.model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 102 | streamer = TextIteratorStreamer(tokenizer,skip_prompt=True) 103 | if torch.__version__ >= "2" and sys.platform != "win32": 104 | model = torch.compile(model) 105 | demo.queue().launch(share=False, debug=True,server_name="0.0.0.0") 106 | -------------------------------------------------------------------------------- /examples/llama2_for_langchain.py: -------------------------------------------------------------------------------- 1 | from langchain.llms.base import LLM 2 | from typing import Dict, List, Any, Optional 3 | import torch,sys,os 4 | from transformers import AutoTokenizer 5 | 6 | 7 | class Llama2(LLM): 8 | 
max_token: int = 2048 9 | temperature: float = 0.1 10 | top_p: float = 0.95 11 | tokenizer: Any 12 | model: Any 13 | 14 | def __init__(self, model_name_or_path, bit4=False): 15 | super().__init__() 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,use_fast=False) 17 | self.tokenizer.pad_token = self.tokenizer.eos_token 18 | if bit4==False: 19 | from transformers import AutoModelForCausalLM 20 | device_map = "cuda:0" if torch.cuda.is_available() else "auto" 21 | self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,device_map=device_map,torch_dtype=torch.float16,load_in_8bit=True,trust_remote_code=True,use_flash_attention_2=True) 22 | self.model.eval() 23 | else: 24 | from auto_gptq import AutoGPTQForCausalLM 25 | self.model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,low_cpu_mem_usage=True, device="cuda:0", use_triton=False,inject_fused_attention=False,inject_fused_mlp=False) 26 | 27 | if torch.__version__ >= "2" and sys.platform != "win32": 28 | self.model = torch.compile(self.model) 29 | 30 | @property 31 | def _llm_type(self) -> str: 32 | return "Llama2" 33 | 34 | def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: 35 | print('prompt:',prompt) 36 | input_ids = self.tokenizer(prompt, return_tensors="pt",add_special_tokens=False).input_ids.to('cuda') 37 | generate_input = { 38 | "input_ids":input_ids, 39 | "max_new_tokens":1024, 40 | "do_sample":True, 41 | "top_k":50, 42 | "top_p":self.top_p, 43 | "temperature":self.temperature, 44 | "repetition_penalty":1.2, 45 | "eos_token_id":self.tokenizer.eos_token_id, 46 | "bos_token_id":self.tokenizer.bos_token_id, 47 | "pad_token_id":self.tokenizer.pad_token_id 48 | } 49 | generate_ids = self.model.generate(**generate_input) 50 | generate_ids = [item[len(input_ids[0]):-1] for item in generate_ids] 51 | result_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 52 | return result_message 53 | -------------------------------------------------------------------------------- /inference-speed/CPU/ggml/README.md: -------------------------------------------------------------------------------- 1 | ## 使用llama.cpp量化部署 2 | 3 | 以[llama.cpp工具](https://github.com/Rayrtfr/llama.cpp)为例,介绍模型量化并在本地部署的详细步骤。Windows则可能需要cmake等编译工具的安装。**本地快速部署体验推荐使用经过指令精调的[Atom-7B-Chat](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#%E5%9F%BA%E4%BA%8Ellama2%E7%9A%84%E4%B8%AD%E6%96%87%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8Batom)模型,有条件的推荐使用6-bit或者8-bit模型,效果更佳。** 运行前请确保: 4 | 5 | 1. 系统应有`make`(MacOS/Linux自带)或`cmake`(Windows需自行安装)编译工具 6 | 2. 建议使用Python 3.10以上编译和运行该工具 7 | 8 | 9 | ### Step 1: 克隆和编译llama.cpp 10 | 11 | 1. (可选)如果已下载旧版仓库,建议`git pull`拉取最新代码,**并执行`make clean`进行清理** 12 | 1. 拉取最新版适配过Atom大模型的llama.cpp仓库代码 13 | 14 | ```bash 15 | $ git clone https://github.com/Rayrtfr/llama.cpp 16 | ``` 17 | 18 | 2. 
对llama.cpp项目进行编译,生成`./main`(用于推理)和`./quantize`(用于量化)二进制文件。 19 | 20 | ```bash 21 | $ make 22 | ``` 23 | 24 | **Windows/Linux用户**如需启用GPU推理,则推荐与[BLAS(或cuBLAS如果有GPU)一起编译](https://github.com/Rayrtfr/llama.cpp#blas-build),可以提高prompt处理速度。以下是和cuBLAS一起编译的命令,适用于NVIDIA相关GPU。参考:[llama.cpp#blas-build](https://github.com/Rayrtfr/llama.cpp#blas-build) 25 | 26 | ```bash 27 | $ make LLAMA_CUBLAS=1 28 | ``` 29 | 30 | **macOS用户**无需额外操作,llama.cpp已对ARM NEON做优化,并且已自动启用BLAS。M系列芯片推荐使用Metal启用GPU推理,显著提升速度。只需将编译命令改为:`LLAMA_METAL=1 make`,参考[llama.cpp#metal-build](https://github.com/Rayrtfr/llama.cpp#metal-build) 31 | 32 | ```bash 33 | $ LLAMA_METAL=1 make 34 | ``` 35 | 36 | ### Step 2: 生成量化版本模型 37 | 38 | 目前llama.cpp已支持`.safetensors`文件以及huggingface格式`.bin`转换为GGUF的FP16格式。 39 | 40 | /path/Atom-7B-Chat是模型下载的目录位置。 41 | ```bash 42 | $ python convert.py --outfile ./atom-7B-cpp.gguf /path/Atom-7B-Chat 43 | 44 | $ ./quantize ./atom-7B-cpp.gguf ./ggml-atom-7B-q4_0.gguf q4_0 45 | ``` 46 | 47 | ### Step 3: 加载并启动模型 48 | 49 | 50 | - 如果想使用GPU推理:cuBLAS/Metal编译需要指定offload层数,在`./main`中指定例如`-ngl 40`表示offload 40层模型参数到GPU 51 | 52 | 53 | 使用以下命令启动聊天。 54 | ```bash 55 | text="Human: 介绍一下北京\nAssistant:" 56 | ./main -m \ 57 | ./ggml-atom-7B-q4_0.gguf \ 58 | -p "${text}" \ 59 | --logdir ./logtxt 60 | ``` 61 | 如果要带聊天的上下文,上面的text需要调整成类似这样: 62 | ```bash 63 | text="Human: 介绍一下北京\nAssistant:北京是一个美丽的城市\nHuman: 再介绍一下合肥\nAssistant:" 64 | ``` 65 | 66 | 更详细的官方说明请参考:[https://github.com/ggerganov/llama.cpp/tree/master/examples/main](https://github.com/ggerganov/llama.cpp/tree/master/examples/main) 67 | -------------------------------------------------------------------------------- /inference-speed/GPU/FasterTransformer_example/README.md: -------------------------------------------------------------------------------- 1 | # FasterTransformer && Triton 安装和使用 2 | 3 | FasterTransformer & Triton 加速LLama2模型推理。 目前支持fp16或者Int8推理,Int4目前还不支持。 4 | 5 | ## 0. 准备环境变量 6 | 7 | ```bash 8 | export BUILD_DICTIONARY="/workspace/build" 9 | export TRITON_VERSION=23.04 10 | ``` 11 | 12 | 13 | ## 一. 镜像构建 14 | 15 | 16 | 1. 构建镜像 17 | 18 | ```bash 19 | cd $BUILD_DICTIONARY 20 | git clone https://github.com/Rayrtfr/fastertransformer_backend.git 21 | 22 | cd $BUILD_DICTIONARY/fastertransformer_backend 23 | 24 | export TRITON_VERSION=23.04 25 | 26 | # 如何不想通过下面的命令构建,也可以直接下载我们已经构建好的镜像: docker pull xiangtao1994/atom_triton_ft:23.04 27 | docker build --build-arg TRITON_VERSION=${TRITON_VERSION} -t triton_ft_backend:${TRITON_VERSION} -f docker/Dockerfile . 28 | 29 | ``` 30 | TRITON_VERSION=23.04 这个镜像需的GPU的驱动版本是 Driver Version: 535.54.03,如果你的GPU的驱动不是这个版本,需要[https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12) 31 | 找到cuda driver 对应版本的 triton-inference-server。 32 | 33 | 34 | 2.启动容器 35 | 36 | ``` 37 | # 启动容器 38 | export TRITON_VERSION=23.04 39 | 40 | # 注意需要 BUILD_DICTIONARY 挂载到容器里面 41 | docker run -idt --gpus=all --net=host --shm-size=4G --name triton_ft_backend_pure \ 42 | -v $BUILD_DICTIONARY:$BUILD_DICTIONARY \ 43 | -p18888:8888 -p18000:8000 -p18001:8001 -p18002:8002 triton_ft_backend:${TRITON_VERSION} bash 44 | 45 | ```` 46 | 47 | ## 二.容器内操作 48 | 49 | 下面介绍一下[Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)模型的权重转换成FasterTransformer格式。 [Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat)也是类似的方式。 50 | 51 | 1. 
转换权重, 权重转换成FasterTransformer格式 52 | 53 | ``` 54 | cd $BUILD_DICTIONARY && git clone https://github.com/Rayrtfr/FasterTransformer.git 55 | 56 | cd $BUILD_DICTIONARY/FasterTransformer 57 | 58 | mkdir models && chmod -R 777 ./* 59 | 60 | python3 ./examples/cpp/llama/huggingface_llama_convert.py \ 61 | -saved_dir=./models/llama \ 62 | -in_file=/path/FlagAlpha/Atom-7B-Chat \ 63 | -infer_gpu_num=1 \ 64 | -weight_data_type=fp16 \ 65 | -model_name=llama 66 | ``` 67 | 68 | 2. 修改模型配置 69 | 70 | - 编辑config.pbtxt 71 | 72 | ``` bash 73 | mkdir $BUILD_DICTIONARY/triton-model-store/ 74 | 75 | cd $BUILD_DICTIONARY/triton-model-store/ 76 | 77 | cp -r $BUILD_DICTIONARY/fastertransformer_backend/all_models/llama $BUILD_DICTIONARY/triton-model-store/ 78 | 79 | # 修改 triton-model-store/llama/fastertransformer/config.pbtxt 80 | 81 | parameters { 82 | key: "tensor_para_size" 83 | value: { 84 | string_value: "1" 85 | } 86 | } 87 | 88 | ## 修改 model_checkpoint_path 为上面转换之后的路径 89 | parameters { 90 | key: "model_checkpoint_path" 91 | value: { 92 | string_value: "/workspace/build/FasterTransformer/models/llama/1-gpu/" 93 | } 94 | } 95 | 96 | ## 模型使用int8推理需要加一下面的配置 97 | parameters { 98 | key: "int8_mode" 99 | value: { 100 | string_value: "1" 101 | } 102 | } 103 | ``` 104 | 105 | 106 | 修改 model.py 107 | 108 | ``` 109 | # 修改这两个文件 110 | triton-model-store/llama/preprocessing/1/model.py 111 | triton-model-store/llama/postprocessing/1/model.py 112 | 113 | # 检查 这个路径为tokenier对应的路径 114 | self.tokenizer = LlamaTokenizer.from_pretrained("/path/FlagAlpha/Atom-7B-Chat") 115 | ``` 116 | 117 | 118 | 3. 编译 FasterTransformer Library 119 | 120 | (同一类型的模型,编译一次就行了) 121 | 编译之前检查 FasterTransformer/examples/cpp/llama/llama_config.ini 122 | 123 | ```bash 124 | # 单卡推理这里是1,多卡可以改成卡的数目 125 | tensor_para_size=1 126 | 127 | model_dir=/workspace/build/FasterTransformer/models/llama/1-gpu/ 128 | ``` 129 | 130 | 编译 FasterTransformer 131 | ```bash 132 | cd $BUILD_DICTIONARY/FasterTransformer 133 | 134 | git submodule init && git submodule update 135 | pip3 install fire jax jaxlib transformers 136 | 137 | mkdir build && cd build 138 | cmake -DSM=86 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON -D PYTHON_PATH=/usr/bin/python3 .. 139 | make -j12 140 | make install 141 | ``` 142 | 143 | 144 | ## 三. 
启动 triton server 145 | 146 | 同样在上面的容器内操作。 147 | ``` 148 | CUDA_VISIBLE_DEVICES=0 /opt/tritonserver/bin/tritonserver --model-repository=$BUILD_DICTIONARY/triton-model-store/llama/ 149 | ``` 150 | 输出 151 | ``` 152 | I0717 17:17:14.670037 70681 grpc_server.cc:2450] Started GRPCInferenceService at 0.0.0.0:8001 153 | I0717 17:17:14.670495 70681 http_server.cc:3555] Started HTTPService at 0.0.0.0:8000 154 | I0717 17:17:14.713000 70681 http_server.cc:185] Started Metrics Service at 0.0.0.0:8002 155 | ``` 156 | 157 | 158 | 同样在上面的容器内操作,启动client测试(如果在容器外注意需要修改下面的url参数的端口号) 159 | 160 | ``` 161 | python3 $BUILD_DICTIONARY/fastertransformer_backend/inference_example/llama/llama_grpc_stream_client.py \ 162 | --url 127.0.0.1:8001 \ 163 | --hf_model_location /path/FlagAlpha/Atom-7B-Chat \ 164 | -topp 0.95 165 | ``` 166 | -------------------------------------------------------------------------------- /inference-speed/GPU/JittorLLMs_example/README.md: -------------------------------------------------------------------------------- 1 | # JittorLLMs推理部署 2 | 3 | ## 配置要求 4 | 5 | * 内存要求:至少2G,推荐32G 6 | * 显存:可选, 推荐16G 7 | * 操作系统:支持Windows,Mac,Linux全平台。 8 | * 磁盘空间:至少40GB空闲磁盘空间,用于下载参数和存储交换文件。 9 | * Python版本要求至少`3.9`。 10 | 11 | 磁盘空间不够时,可以通过环境变量`JITTOR_HOME`指定缓存存放路径。 12 | 内存或者显存不够,出现进程被杀死的情况,请参考下方,[限制内存消耗的方法](#配置要求低)。 13 | 14 | ## 部署方法 15 | 16 | 可以通过下述指令安装依赖。(注意:此脚本会安装Jittor版torch,推荐用户新建环境运行) 17 | 18 | ``` 19 | # 国内使用 gitlink clone 20 | git clone https://gitlink.org.cn/jittor/JittorLLMs.git --depth 1 21 | # github: git clone https://github.com/Jittor/JittorLLMs.git --depth 1 22 | cd JittorLLMs 23 | # -i 指定用jittor的源, -I 强制重装Jittor版torch 24 | pip install -r requirements.txt -i https://pypi.jittor.org/simple -I 25 | ``` 26 | 27 | 如果出现找不到jittor版本的错误,可能是您使用的镜像还没有更新,使用如下命令更新最新版:`pip install jittor -U -i https://pypi.org/simple` 28 | 29 | 部署只需一行命令即可: 30 | 31 | ``` 32 | python cli_demo.py atom7b 33 | ``` 34 | 35 | 运行后会自动从服务器上下载模型文件到本地,会占用根目录下一定的硬盘空间。 36 | 最开始运行的时候会编译一些CUDA算子,这会花费一些时间进行加载。 37 | 38 | 内存或者显存不够,出现进程被杀死的情况,请参考下方,[限制内存消耗的方法](#配置要求低)。 39 | 40 | ### WebDemo 41 | 42 | JittorLLM通过gradio库,允许用户在浏览器之中和大模型直接进行对话。 43 | 44 | ~~~bash 45 | python web_demo.py atom7b 46 | ~~~ 47 | 48 | ### 后端服务部署 49 | 50 | JittorLLM在api.py文件之中,提供了一个架设后端服务的示例。 51 | 52 | ~~~bash 53 | python api.py atom7b 54 | ~~~ 55 | 56 | 接着可以使用如下代码进行直接访问 57 | 58 | ~~~python 59 | post_data = json.dumps({'prompt': 'Hello, solve 5x=13'}) 60 | print(json.loads(requests.post("http://0.0.0.0:8000", post_data).text)['response']) 61 | ~~~ 62 | 63 | ## 配置要求低 64 | 65 | 针对大模型显存消耗大等痛点,Jittor团队研发了动态交换技术,Jittor框架是世界上首个支持动态图变量自动交换功能的框架,区别于以往的基于静态图交换技术,用户不需要修改任何代码,原生的动态图代码即可直接支持张量交换,张量数据可以在显存-内存-硬盘之间自动交换,降低用户开发难度。 66 | 67 | 同时,Jittor大模型推理库也是目前对配置门槛要求最低的框架,只需要参数磁盘空间和2G内存,无需显卡,也可以部署大模型,下面是在不同硬件配置条件下的资源消耗与速度对比。可以发现,JittorLLMs在显存充足的情况下,性能优于同类框架,而显存不足甚至没有显卡,JittorLLMs都能以一定速度运行。 68 | 69 | 节省内存方法,请安装Jittor版本大于1.3.7.8,并添加如下环境变量: 70 | ```bash 71 | export JT_SAVE_MEM=1 72 | # 限制cpu最多使用16G 73 | export cpu_mem_limit=16000000000 74 | # 限制device内存(如gpu、tpu等)最多使用8G 75 | export device_mem_limit=8000000000 76 | # windows 用户,请使用powershell 77 | # $env:JT_SAVE_MEM="1" 78 | # $env:cpu_mem_limit="16000000000" 79 | # $env:device_mem_limit="8000000000" 80 | ``` 81 | 用户可以自由设定cpu和设备内存的使用量,如果不希望对内存进行限制,可以设置为`-1`。 82 | ```bash 83 | # 限制cpu最多使用16G 84 | export cpu_mem_limit=-1 85 | # 限制device内存(如gpu、tpu等)最多使用8G 86 | export device_mem_limit=-1 87 | # windows 用户,请使用powershell 88 | # $env:JT_SAVE_MEM="1" 89 | # $env:cpu_mem_limit="-1" 90 | # $env:device_mem_limit="-1" 91 | ``` 92 | 93 | 如果想要清理磁盘交换文件,可以运行如下命令 
94 | ```bash 95 | python -m jittor_utils.clean_cache swap 96 | ``` 97 | -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/README.md: -------------------------------------------------------------------------------- 1 | # 使用NVIDIA TensorRT-LLM部署LLama2 或者Atom 2 | 3 | [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main)是NVIDIA开发的高性能推理框架,您可以按照以下步骤来使用TensorRT-LLM部署LLama2模型或者Atom模型。 4 | 5 | 以下部署流程参考[TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama),需要机器Nvidia显卡驱动535版本以上 6 | 7 | ## Support Matrix 8 | * FP16 9 | * FP8 10 | * INT8 & INT4 Weight-Only 11 | * SmoothQuant 12 | * Groupwise quantization (AWQ/GPTQ) 13 | * FP8 KV CACHE 14 | * INT8 KV CACHE (+ AWQ/per-channel weight-only) 15 | * Tensor Parallel 16 | * STRONGLY TYPED 17 | 18 | ## 1. 安装TensorRT-LLM 19 | #### 获取TensorRT-LLM代码: 20 | 21 | ```bash 22 | # TensorRT-LLM 代码需要使用 git-lfs 拉取 23 | apt-get update && apt-get -y install git git-lfs 24 | 25 | git clone https://github.com/NVIDIA/TensorRT-LLM.git 26 | cd TensorRT-LLM 27 | 28 | # 本流程将使用 v0.7.0 Release 版本 29 | git checkout tags/v0.7.0 -b release/0.7.0 30 | git submodule update --init --recursive 31 | git lfs install 32 | git lfs pull 33 | ``` 34 | #### 构建docker镜像并安装TensorRT-LLM 35 | ```bash 36 | make -C docker release_build 37 | ``` 38 | 39 | #### 运行docker镜像: 40 | ```bash 41 | make -C docker release_run 42 | ``` 43 | 44 | ## 2. 为LLama2模型构建TensorRT-LLM推理引擎: 45 | 46 | #### 进入build文件夹: 47 | ```bash 48 | cd ./examples/llama 49 | ``` 50 | 51 | #### 从Huggingface下载Atom或者LLama2模型: 52 | ``` 53 | # 您可以选择具体想部署的模型下载 54 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 55 | mv Atom-7B-Chat /origin_model 56 | ``` 57 | 58 | #### 使用build.py 构建推理引擎: 59 | 以下是一个常见事例,更多参数参考[TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) 60 | ```bash 61 | python build.py --max_batch_size 1 --max_num_tokens 8192 --model_dir /origin_model --dtype float16 --remove_input_padding --use_inflight_batching --paged_kv_cache --use_weight_only --enable_context_fmha --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --output_dir /model/tensorrt_llm/1 --world_size 1 --tp_size 1 --pp_size 1 --max_input_len 7168 --max_output_len 1024 --multi_block_mode --rotary_scaling dynamic 8.0 --rotary_base 500000 62 | ``` 63 | 64 | ## 3. 
使用TensorRT-LLM Python Runtime进行推理 65 | 66 | #### 使用我们提供的python代码类,启动单机单卡服务 67 | ```bash 68 | python atom_inference.py \ 69 | /model/tensorrt_llm/1 \ # 第一个参数 build.py 的output路径 70 | /origin_model \ # 第二个参数模型tokenizer的路径 71 | 如何成为一个更加优秀的人 # 希望问的问题 72 | ``` -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/atom_inference.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, 8 | load_tokenizer, read_model_name, throttle_generator) 9 | 10 | import tensorrt_llm 11 | from tensorrt_llm.logger import logger 12 | from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner 13 | 14 | if PYTHON_BINDINGS: 15 | from tensorrt_llm.runtime import ModelRunnerCpp 16 | 17 | class AtomTRTApi: 18 | def __init__(self,engine_dir,tokenizer_dir,max_input_length=4096): 19 | self.runtime_rank = tensorrt_llm.mpi_rank() 20 | self.model_name = read_model_name(engine_dir) 21 | 22 | self.tokenizer, self.pad_id, self.end_id = load_tokenizer( 23 | tokenizer_dir=tokenizer_dir, 24 | tokenizer_type='llama', 25 | ) 26 | self.use_py_session=False 27 | if not PYTHON_BINDINGS: 28 | logger.warning( 29 | "Python bindings of C++ session is unavailable, fallback to Python session." 30 | ) 31 | self.use_py_session = True 32 | runner_cls = ModelRunner if self.use_py_session else ModelRunnerCpp 33 | runner_kwargs = dict(engine_dir=engine_dir, 34 | lora_dir=None, 35 | rank=self.runtime_rank, 36 | debug_mode=False, 37 | lora_ckpt_source='hf') 38 | 39 | if not self.use_py_session: 40 | runner_kwargs.update( 41 | max_batch_size=1, 42 | max_input_len=max_input_length, 43 | max_output_len=2048, 44 | max_beam_width=1, 45 | max_attention_window_size=None) 46 | self.runner = runner_cls.from_dir(**runner_kwargs) 47 | 48 | 49 | def ask(self,input_text,temperature=0.4,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',merge_lambda=None,max_input_length=4096,append_next_role=True): 50 | with torch.no_grad(): 51 | prompt = '' 52 | print('max_input_length',max_input_length) 53 | if type(input_text)==list: 54 | for input_text_one in input_text[::-1]: 55 | if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n")" + prompt 57 | if append_next_role: 58 | if input_text[-1]['role']=='Human': 59 | prompt += "Assistant:" 60 | else: 61 | prompt += "Human:" 62 | else: 63 | if merge_lambda is None: 64 | if append_next_role: 65 | prompt += "Human: "+input_text.strip()+"\nAssistant:" 66 | else: 67 | prompt += "Human: "+input_text.strip()+"\n" 68 | else: 69 | prompt += merge_lambda(input_text) 70 | if len(system_prefix)>0: 71 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt 72 | print('输入模型的完整输入:',prompt) 73 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids] 74 | print(input_ids) 75 | input_ids = [ 76 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids 77 | ] 78 | print('输入模型的token数量',input_ids[0].shape) 79 | generate_input = { 80 | "batch_input_ids":input_ids, 81 | "max_new_tokens":max_new_tokens, 82 | "max_attention_window_size":None, 83 | "do_sample":True, 84 | "top_k":50, 85 | "top_p":top_p, 86 | "num_beams":1, 87 | "length_penalty":1.0, 88 | "stop_words_list":None, 89 | "bad_words_list":None, 90 | "streaming":False, 91 | "temperature":temperature, 92 | "output_sequence_lengths":True, 
93 | "return_dict":False, 94 | "repetition_penalty":repetition_penalty, 95 | "end_id":self.tokenizer.eos_token_id, 96 | "bos_token_id":self.tokenizer.bos_token_id, 97 | "pad_id":self.tokenizer.pad_token_id 98 | } 99 | generate_ids = self.runner.generate(**generate_input) 100 | torch.cuda.synchronize() 101 | print(generate_ids) 102 | generate_ids = generate_ids.cpu().tolist() 103 | generate_ids = [item[0][len(input_ids[0][0]):] for item in generate_ids] 104 | try: 105 | generate_ids = [item[:item.index(self.tokenizer.eos_token_id)] for item in generate_ids ] 106 | except: 107 | pass 108 | print(generate_ids) 109 | # output = ''.join(tokenizer.convert_ids_to_tokens(generate_ids[0])) 110 | # print('生成的token长度',len(generate_ids[0])) 111 | bot_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 112 | if 'Human:' in bot_message: 113 | bot_message = bot_message.split('Human:')[0] 114 | print(bot_message) 115 | return bot_message.strip() 116 | 117 | def ask_streaming(self,input_text,temperature=0.8,top_p=0.95,max_new_tokens=1024,repetition_penalty=1.2,system_prefix = '',max_input_length=4096,append_next_role=True): 118 | with torch.no_grad(): 119 | prompt = '' 120 | print('max_input_length',max_input_length) 121 | if type(input_text)==list: 122 | for input_text_one in input_text[::-1]: 123 | if len(prompt) + len(""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n")" + prompt 125 | if append_next_role: 126 | if input_text[-1]['role']=='Human': 127 | prompt += "Assistant:" 128 | else: 129 | prompt += "Human:" 130 | else: 131 | if append_next_role: 132 | prompt += "Human: "+input_text.strip()+"\nAssistant:" 133 | else: 134 | prompt += "Human: "+input_text.strip()+"\n" 135 | if len(system_prefix)>0: 136 | prompt = 'System: '+system_prefix.strip()+'\n'+prompt 137 | print('输入模型的完整输入:',prompt) 138 | input_ids = [self.tokenizer(prompt,add_special_tokens=False).input_ids] 139 | print(input_ids) 140 | input_ids = [ 141 | torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids 142 | ] 143 | print('输入模型的token数量',input_ids[0].shape) 144 | generate_input = { 145 | "batch_input_ids":input_ids, 146 | "max_new_tokens":max_new_tokens, 147 | "max_attention_window_size":None, 148 | "do_sample":True, 149 | "top_k":50, 150 | "top_p":top_p, 151 | "num_beams":1, 152 | "length_penalty":1.0, 153 | "stop_words_list":None, 154 | "bad_words_list":None, 155 | "streaming":True, 156 | "temperature":temperature, 157 | "output_sequence_lengths":True, 158 | "return_dict":True, 159 | "repetition_penalty":repetition_penalty, 160 | "end_id":self.tokenizer.eos_token_id, 161 | "bos_token_id":self.tokenizer.bos_token_id, 162 | "pad_id":self.tokenizer.pad_token_id 163 | } 164 | generate_ids = self.runner.generate(**generate_input) 165 | torch.cuda.synchronize() 166 | 167 | input_token_num = len(input_ids[0][0]) 168 | answer_message ='' 169 | for curr_outputs in throttle_generator(generate_ids,2): 170 | output_ids = curr_outputs['output_ids'] 171 | sequence_lengths = curr_outputs['sequence_lengths'] 172 | # print(sequence_lengths) 173 | output_ids = output_ids.cpu().tolist() 174 | output_ids = [item[0][input_token_num:sequence_lengths[0][0]] for item in output_ids] 175 | answer_message = self.tokenizer.batch_decode(output_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 176 | if 'Human:' in answer_message: 177 | answer_message = answer_message.split('Human:')[0] 178 | yield answer_message.strip() 179 | return 
answer_message.strip() 180 | 181 | 182 | if __name__=='__main__': 183 | model = AtomTRTApi(engine_dir=sys.argv[1],tokenizer_dir=sys.argv[2]) 184 | model.ask('如何成为一个更优秀的人') 185 | -------------------------------------------------------------------------------- /inference-speed/GPU/TensorRT-LLM_example/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | from pathlib import Path 18 | from typing import Optional 19 | from typing import Union 20 | 21 | from transformers import AutoTokenizer, T5Tokenizer 22 | 23 | import tensorrt_llm 24 | 25 | DEFAULT_HF_MODEL_DIRS = { 26 | 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', 27 | 'bloom': 'bigscience/bloom-560m', 28 | 'chatglm_6b': 'THUDM/chatglm-6b', 29 | 'chatglm2_6b': 'THUDM/chatglm2-6b', 30 | 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', 31 | 'chatglm3_6b': 'THUDM/chatglm3-6b', 32 | 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', 33 | 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', 34 | 'falcon': 'tiiuae/falcon-rw-1b', 35 | 'glm_10b': 'THUDM/glm-10b', 36 | 'gpt': 'gpt2-medium', 37 | 'gptj': 'EleutherAI/gpt-j-6b', 38 | 'gptneox': 'EleutherAI/gpt-neox-20b', 39 | 'internlm': 'internlm/internlm-chat-7b', 40 | 'llama': 'meta-llama/Llama-2-7b-hf', 41 | 'mpt': 'mosaicml/mpt-7b', 42 | 'phi': 'microsoft/phi-2', 43 | 'opt': 'facebook/opt-350m', 44 | 'qwen': 'Qwen/Qwen-7B', 45 | } 46 | 47 | DEFAULT_PROMPT_TEMPLATES = { 48 | 'internlm': 49 | "<|User|>:{input_text}\n<|Bot|>:", 50 | 'qwen': 51 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", 52 | } 53 | 54 | def get_engine_version(engine_dir: str) -> Union[None, str]: 55 | engine_dir = Path(engine_dir) 56 | config_path = engine_dir / "config.json" 57 | with open(config_path, 'r') as f: 58 | config = json.load(f) 59 | 60 | if 'version' not in config: 61 | return None 62 | 63 | return config['version'] 64 | 65 | def read_model_name(engine_dir: str): 66 | engine_version = get_engine_version(engine_dir) 67 | 68 | with open(Path(engine_dir) / "config.json", 'r') as f: 69 | config = json.load(f) 70 | 71 | if engine_version is None: 72 | return config['builder_config']['name'] 73 | 74 | return config['pretrained_config']['architecture'] 75 | 76 | 77 | def throttle_generator(generator, stream_interval): 78 | for i, out in enumerate(generator): 79 | if not i % stream_interval: 80 | yield out 81 | 82 | if i % stream_interval: 83 | yield out 84 | 85 | 86 | def load_tokenizer(tokenizer_dir: Optional[str] = None, 87 | vocab_file: Optional[str] = None, 88 | model_name: str = 'gpt', 89 | tokenizer_type: Optional[str] = None): 90 | if vocab_file is None: 91 | use_fast = True 92 | if tokenizer_type is not None and tokenizer_type == "llama": 93 | use_fast = 
False 94 | # Should set both padding_side and truncation_side to be 'left' 95 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, 96 | legacy=False, 97 | padding_side='left', 98 | truncation_side='left', 99 | trust_remote_code=True, 100 | tokenizer_type=tokenizer_type, 101 | use_fast=use_fast) 102 | else: 103 | # For gpt-next, directly load from tokenizer.model 104 | assert model_name == 'gpt' 105 | tokenizer = T5Tokenizer(vocab_file=vocab_file, 106 | padding_side='left', 107 | truncation_side='left') 108 | 109 | if model_name == 'qwen': 110 | with open(Path(tokenizer_dir) / "generation_config.json") as f: 111 | gen_config = json.load(f) 112 | chat_format = gen_config['chat_format'] 113 | if chat_format == 'raw': 114 | pad_id = gen_config['pad_token_id'] 115 | end_id = gen_config['eos_token_id'] 116 | elif chat_format == 'chatml': 117 | pad_id = tokenizer.im_end_id 118 | end_id = tokenizer.im_end_id 119 | else: 120 | raise Exception(f"unknown chat format: {chat_format}") 121 | elif model_name == 'glm_10b': 122 | pad_id = tokenizer.pad_token_id 123 | end_id = tokenizer.eop_token_id 124 | else: 125 | if tokenizer.pad_token_id is None: 126 | tokenizer.pad_token_id = tokenizer.eos_token_id 127 | pad_id = tokenizer.pad_token_id 128 | end_id = tokenizer.eos_token_id 129 | 130 | return tokenizer, pad_id, end_id 131 | -------------------------------------------------------------------------------- /inference-speed/GPU/lmdeploy_example/README.md: -------------------------------------------------------------------------------- 1 | # lmdeploy 安装和使用 2 | 3 | lmdeploy 支持 transformer 结构(例如 Atom、LLaMA、LLaMa2、InternLM、Vicuna 等),目前支持 fp16,int8 和 int4。 4 | 5 | ## 一、安装 6 | 7 | 安装预编译的 python 包 8 | ``` 9 | python3 -m pip install lmdeploy==0.2.1 10 | ``` 11 | 12 | ## 二、转换huggingface模型为lmdeploy格式 13 | 14 | 把模型转成 lmdeploy 推理格式,假设 huggingface 版 [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) 模型已下载到 `/models/Atom-7B-Chat` 目录,结果会存到 当前执行命令的`workspace` 文件夹 15 | 16 | ```shell 17 | lmdeploy convert llama2 /models/Atom-7B-Chat 18 | ``` 19 | lmdeploy 修改一处bug 20 | ``` 21 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/preprocessing/1/tokenizer/tokenizer.py 22 | sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/postprocessing/1/tokenizer/tokenizer.py 23 | ``` 24 | 25 | 26 | ## 三、kv cache int8 量化 27 | 对于最大长度是 2048 的 Atom-7B fp16 模型,服务端每创建 1 个并发,都需要大约 1030MB 显存保存 kv_cache,即便是 A100 80G,能服务的用户也非常有限。 28 | 为了降低运行时显存,lmdeploy 实现了 kv cache PTQ 量化,同样的显存可以服务更多并发用户。 29 | 首先计算模型参数,保存到临时目录 atom 30 | ```shell 31 | mkdir atom 32 | lmdeploy lite calibrate \ 33 | /models/Atom-7B-Chat \ # huggingface Atom 模型。也支持 llama/vicuna/internlm/baichuan 等 34 | --calib-dataset 'ptb' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval 35 | --calib-samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 36 | --device 'cuda' \ # 单条的文本长度,如果显存不够,可以适当调小 37 | --work-dir atom # 保存 pth 格式量化统计参数和量化后权重的文件夹 38 | ``` 39 | 注意:可能需要安装flash_attn 40 | ```shell 41 | conda install -c nvidia cuda-nvcc # 为了使用conda内的cuda环境安装 flash_attn 42 | pip install flash_attn 43 | ``` 44 | 45 | 46 | 然后用 atom 目录里的参数,计算量化参数,保存到转换后参数到 `workspace/triton_models/weights` 下 47 | 48 | ```shell 49 | lmdeploy lite kv_qparams \ 50 | ./atom \ # 上一步计算的 atom 结果 51 | ./workspace/triton_models/weights \ # 结果保存目录 52 | --num-tp 1 # tensor parallel GPU 个数 53 | ``` 54 | 55 | 修改推理配置,开启 kv cache int8。编辑 `workspace/triton_models/weights/config.ini` 56 | * 把 
`use_context_fmha` 改为 0,表示关闭 flashattention 57 | * 把 `quant_policy` 设为 4,表示打开 kv cache 量化 58 | 59 | 最终执行测试即可 60 | ```shell 61 | lmdeploy chat turbomind ./workspace 62 | ``` 63 | 64 | [点击这里](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/kv_int8.md) 查看 kv cache int8 量化实现公式、精度和显存测试报告。 65 | 66 | ## 四、weight int4 量化 67 | 68 | lmdeploy 基于 [AWQ 算法](https://arxiv.org/abs/2306.00978) 实现了 weight int4 量化,性能是 FP16 的 2.4 倍以上。显存从 16G 降低到 6.3G。 69 | 70 | 对于自己的模型,可以用`auto_awq`工具来优化 71 | ```shell 72 | # 指定量化导出的模型路径 73 | WORK_DIR="./atom-7b-chta-w4" 74 | 75 | lmdeploy lite auto_awq \ 76 | $HF_MODEL \ # huggingface 模型位置 77 | --calib-dataset 'ptb' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval 78 | --calib-samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 79 | --calib-seqlen 2048 \ # 单条的文本长度,如果显存不够,可以适当调小 80 | --w-bits 4 \ # 权重量化的 bit 数 81 | --w-group-size 128 \ # 权重量化分组统计尺寸 82 | --work-dir $WORK_DIR 83 | ``` 84 | 85 | 执行以下命令,启动服务: 86 | ```shell 87 | # 这里的路径是上面步骤一中转换模型的layout的输出 88 | FasterTransformer_PATH="/path/workspace" 89 | 90 | TP=1 91 | # 指定需要用的显卡 92 | DEVICES="0" 93 | for ((i = 1; i < ${TP}; ++i)); do 94 | DEVICES="${DEVICES},$i" 95 | done 96 | DEVICES="\"device=${DEVICES}\"" 97 | 98 | # 在容器内启动服务 99 | docker run -idt \ 100 | --gpus $DEVICES \ 101 | -v $FasterTransformer_PATH:/workspace/models \ 102 | --shm-size 16g \ 103 | -p 33336:22 \ 104 | -p 33337-33400:33337-33400 \ 105 | --cap-add=SYS_PTRACE \ 106 | --cap-add=SYS_ADMIN \ 107 | --security-opt seccomp=unconfined \ 108 | --name lmdeploy \ 109 | --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \ 110 | tritonserver \ 111 | --model-repository=/workspace/models/model_repository \ 112 | --allow-http=0 \ 113 | --allow-grpc=1 \ 114 | --grpc-port=33337 \ 115 | --log-verbose=0 \ 116 | --allow-metrics=1 117 | ``` 118 | 119 | 客户端测试: 120 | ```shell 121 | python test_api_server.py --tritonserver_addr 127.0.0.1:33337 122 | ``` 123 | 124 | [点击这里](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md) 查看 weight int4 量化的显存和速度测试结果。 125 | 126 | 额外说明,weight int4 和 kv cache int8 二者并不冲突、可以同时打开,节约更多显存。 127 | -------------------------------------------------------------------------------- /inference-speed/GPU/lmdeploy_example/test_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from lmdeploy.serve.turbomind.chatbot import Chatbot 4 | 5 | def input_prompt(chat_history, system_prompt: str): 6 | """Input a prompt in the consolo interface.""" 7 | prompt = '' 8 | for input_text_one in chat_history: 9 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 10 | if chat_history[-1]['role']=='Human': 11 | prompt += "Assistant: " 12 | else: 13 | prompt += "Human: " 14 | prompt = prompt[-2048:] 15 | if len(system_prompt)>0: 16 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 17 | 18 | return prompt 19 | 20 | def main(tritonserver_addr: str, 21 | session_id: int = 1, 22 | cap: str = 'chat', 23 | stream_output: bool = True, 24 | **kwargs): 25 | """An example to communicate with inference server through the command line 26 | interface. 27 | 28 | Args: 29 | tritonserver_addr (str): the address in format "ip:port" of 30 | triton inference server 31 | session_id (int): the identical id of a session 32 | cap (str): the capability of a model. 
For example, codellama has 33 | the ability among ['completion', 'infill', 'instruct', 'python'] 34 | stream_output (bool): indicator for streaming output or not 35 | **kwargs (dict): other arguments for initializing model's chat template 36 | """ 37 | log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING') 38 | kwargs.update(capability=cap) 39 | chatbot = Chatbot(tritonserver_addr, 40 | log_level=log_level, 41 | display=stream_output, 42 | **kwargs) 43 | nth_round = 1 44 | prompt = input_prompt([{"role": "Human", "content" : "心情不好怎么办"}], "") 45 | 46 | request_id = f'{session_id}-{nth_round}' 47 | begin = time.time() 48 | if stream_output: 49 | for status, res, n_token in chatbot.stream_infer( 50 | session_id, 51 | prompt, 52 | request_id=request_id, 53 | request_output_len=512): 54 | # print("n_token:", n_token) 55 | continue 56 | 57 | else: 58 | status, res, n_token = chatbot.infer(session_id, 59 | prompt, 60 | request_id=request_id, 61 | request_output_len=512) 62 | print(res) 63 | # print("n_token:", n_token) 64 | nth_round += 1 65 | end = time.time() 66 | speed = n_token/(end-begin) 67 | print("speed {} tokens/s".format(speed)) 68 | 69 | 70 | if __name__ == '__main__': 71 | import fire 72 | 73 | fire.Fire(main) 74 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/README.md: -------------------------------------------------------------------------------- 1 | # vllm推理部署 2 | 3 | [vllm](https://github.com/vllm-project/vllm)同样是GPU推理的方案。相比较与FasterTrainsformer,vllm更加的简单易用。不需要额外进行模型的转换。支持fp16推理。 4 | 5 | 特点: 6 | 7 | + 快速的推理速度 8 | + 高效的kv cache 9 | + 连续的batch请求推理 10 | + 优化cuda算子 11 | + 支持分布式推理 12 | 13 | ## 第一步: 安装vllm 14 | 15 | ```bash 16 | pip install vllm 17 | ``` 18 | 19 | ## 第二步:启动测试server 20 | 21 | 从Huggingface下载Atom或者LLama3模型: 22 | ``` 23 | # 您可以选择具体想部署的模型下载 24 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 25 | 26 | # 或者下载Meta官方的Llama3模型: 27 | git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct Meta-Llama-3-8B-Instruct 28 | ``` 29 | 30 | 1. 单卡推理 31 | 32 | 编辑single_gpus_api_server.sh里面model为上面模型的下载路径。 33 | 34 | 启动测试server 35 | ```bash 36 | # multi_gpus_api_server.sh 里面的CUDA_VISIBLE_DEVICES指定了要使用的GPU卡 37 | bash single_gpus_api_server.sh 38 | ``` 39 | 40 | 2. 多卡推理 41 | 42 | 13B模型,70B模型推荐多卡推理。编辑multi_gpus_api_server.sh里面model为上面的13B模型的下载路径。 43 | 44 | 启动测试server 45 | ```bash 46 | # multi_gpus_api_server.sh 里面的CUDA_VISIBLE_DEVICES指定了要使用的GPU卡 47 | # tensor-parallel-size 指定了卡的个数 48 | bash multi_gpus_api_server.sh 49 | ``` 50 | 51 | ## 第三步:启动client测试 52 | 53 | 注意下面的model_source 模型的源,可以是 llama_chinese、llama2_meta、llama3_meta 根据下载的模型不同去区分,如果下载的是[FlagAlpha](https://huggingface.co/FlagAlpha)下载的则用llama_chinese。 54 | 55 | ``` 56 | python client_test.py --model_source llama_chinese 57 | ``` 58 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import BackgroundTasks, FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 
15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | 18 | 19 | @app.post("/generate") 20 | async def generate(request: Request) -> Response: 21 | """Generate completion for the request. 22 | 23 | The request should be a JSON object with the following fields: 24 | - prompt: the prompt to use for the generation. 25 | - stream: whether to stream the results or not. 26 | - other fields: the sampling parameters (See `SamplingParams` for details). 27 | """ 28 | request_dict = await request.json() 29 | prompt = request_dict.pop("prompt") 30 | stream = request_dict.pop("stream", False) 31 | sampling_params = SamplingParams(**request_dict) 32 | request_id = random_uuid() 33 | results_generator = engine.generate(prompt, sampling_params, request_id) 34 | 35 | # Streaming case 36 | async def stream_results() -> AsyncGenerator[bytes, None]: 37 | async for request_output in results_generator: 38 | prompt = request_output.prompt 39 | text_outputs = [ 40 | prompt + output.text for output in request_output.outputs 41 | ] 42 | ret = {"text": text_outputs} 43 | yield (json.dumps(ret) + "\0").encode("utf-8") 44 | 45 | async def abort_request() -> None: 46 | await engine.abort(request_id) 47 | 48 | if stream: 49 | background_tasks = BackgroundTasks() 50 | # Abort the request if the client disconnects. 51 | background_tasks.add_task(abort_request) 52 | return StreamingResponse(stream_results(), background=background_tasks) 53 | 54 | # Non-streaming case 55 | final_output = None 56 | async for request_output in results_generator: 57 | if await request.is_disconnected(): 58 | # Abort the request if the client disconnects. 59 | await engine.abort(request_id) 60 | return Response(status_code=499) 61 | final_output = request_output 62 | 63 | assert final_output is not None 64 | prompt = final_output.prompt 65 | text_outputs = [prompt + output.text for output in final_output.outputs] 66 | ret = {"text": text_outputs} 67 | return JSONResponse(ret) 68 | 69 | 70 | if __name__ == "__main__": 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--host", type=str, default="0.0.0.0") 73 | parser.add_argument("--port", type=int, default=8090) 74 | parser.add_argument("--trust_remote_code", type=bool, default=True) 75 | parser = AsyncEngineArgs.add_cli_args(parser) 76 | args = parser.parse_args() 77 | 78 | engine_args = AsyncEngineArgs.from_cli_args(args) 79 | engine = AsyncLLMEngine.from_engine_args(engine_args) 80 | 81 | uvicorn.run(app, 82 | host=args.host, 83 | port=args.port, 84 | log_level="debug", 85 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 86 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/client_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import time 4 | import argparse 5 | 6 | import urllib.request 7 | 8 | import sys 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--model_source', default="llama_chinese", choices =["llama_chinese", "llama2_meta", "llama3_meta"], required=False,type=str) 12 | args = parser.parse_args() 13 | 14 | def get_prompt_llama_chinese( 15 | chat_history, system_prompt="" 16 | ) -> str: 17 | prompt = '' 18 | for input_text_one in chat_history: 19 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 20 | if chat_history[-1]['role']=='Human': 21 | prompt += "Assistant: " 22 | else: 23 | prompt += "Human: " 24 | prompt = prompt[-2048:] 25 | if 
len(system_prompt)>0: 26 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 27 | 28 | return prompt 29 | 30 | def get_prompt_llama2_meta(chat_history, system_prompt=""): 31 | B_INST, E_INST = "[INST]", "[/INST]" 32 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 33 | 34 | sep = " " 35 | sep2 =" " 36 | stop_token_ids = [2] 37 | system_template = f"[INST] <>\n{system_prompt}\n<>\n\n" 38 | roles = ("[INST]", "[/INST]") 39 | seps = [sep, sep2] 40 | if system_prompt.strip() != "": 41 | ret = system_template 42 | else: 43 | ret = "[INST] " 44 | for i, chat in enumerate(chat_history): 45 | message = chat["content"] 46 | role = chat["role"] 47 | if message: 48 | if i == 0: 49 | ret += message + " " 50 | else: 51 | if role == "Human": 52 | ret += "[INST]" + " " + message + seps[i % 2] 53 | else: 54 | ret += "[/INST]" + " " + message + seps[i % 2] 55 | else: 56 | if role == "Human": 57 | ret += "[INST]" 58 | else: 59 | ret += "[/INST]" 60 | print("prompt:{}".format(ret)) 61 | return ret 62 | 63 | def get_prompt_llama3_meta(chat_history, system_prompt=""): 64 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>' 65 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>' 66 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n' 67 | prompt_str = '' 68 | # 拼接历史对话 69 | for item in chat_history: 70 | if item['role']=='Human': 71 | prompt_str+=user_format.format(content=item['content']) 72 | else: 73 | prompt_str+=assistant_format.format(content=item['content']) 74 | if len(system_prompt)>0: 75 | prompt_str = system_format.format(content=system_prompt) + prompt_str 76 | prompt_str = "<|begin_of_text|>" + prompt_str 77 | return prompt_str 78 | 79 | 80 | def test_api_server(chat_history=[], system_prompt=""): 81 | header = {'Content-Type': 'application/json'} 82 | 83 | if args.model_source == "llama2_meta": 84 | prompt = get_prompt_llama2_meta(chat_history, system_prompt) 85 | elif args.model_source == "llama3_meta": 86 | prompt = get_prompt_llama3_meta(chat_history, system_prompt) 87 | else: 88 | prompt = get_prompt_llama_chinese(chat_history, system_prompt) 89 | 90 | data = { 91 | "prompt": prompt, 92 | "stream" : False, 93 | "n" : 1, 94 | "best_of": 1, 95 | "presence_penalty": 0.0, 96 | "frequency_penalty": 0.2, 97 | "temperature": 0.3, 98 | "top_p" : 0.95, 99 | "top_k": 50, 100 | "use_beam_search": False, 101 | "stop": [], 102 | "ignore_eos" :False, 103 | "max_tokens": 2048, 104 | "logprobs": None 105 | } 106 | request = urllib.request.Request( 107 | url='http://127.0.0.1:8090/generate', 108 | headers=header, 109 | data=json.dumps(data).encode('utf-8') 110 | ) 111 | 112 | result = None 113 | try: 114 | response = urllib.request.urlopen(request, timeout=300) 115 | res = response.read().decode('utf-8') 116 | result = json.loads(res) 117 | print(json.dumps(data, ensure_ascii=False, indent=2)) 118 | print(json.dumps(result, ensure_ascii=False, indent=2)) 119 | 120 | except Exception as e: 121 | print(e) 122 | 123 | return result 124 | 125 | if __name__ == "__main__": 126 | # 多伦对话测试 127 | """ 多伦对话测试 128 | last_question = "怎么回来呢" 129 | inputs = [{"role": "Human", "content": "如何去北京"}, 130 | {"role": "Assitant", "content": "乘坐飞机或者轮船"}, 131 | {"role" : "Human", "content": last_question}] 132 | """ 133 | # 单轮对话 134 | last_question = "怎么去北京" 135 | chat_history = [ {"role" : "Human", "content": last_question}] 136 | test_api_server(chat_history) 137 | 138 | 
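# --- Added usage note (not part of the original client_test.py) ---------------
# api_server.py above also accepts "stream": true, in which case it yields JSON
# chunks terminated by a NUL byte ("\0") instead of one final JSON body (see its
# stream_results generator). Below is a minimal sketch of a streaming client; it
# reuses the json/urllib imports and the get_prompt_llama_chinese helper defined
# in this file, and the URL, sampling values and 1024-byte read size are
# illustrative assumptions rather than project defaults.
def test_api_server_stream(chat_history=[], system_prompt=""):
    prompt = get_prompt_llama_chinese(chat_history, system_prompt)
    data = {
        "prompt": prompt,
        "stream": True,        # ask the server to stream partial results
        "temperature": 0.3,
        "top_p": 0.95,
        "max_tokens": 2048,
    }
    request = urllib.request.Request(
        url='http://127.0.0.1:8090/generate',
        headers={'Content-Type': 'application/json'},
        data=json.dumps(data).encode('utf-8')
    )
    buffer = b''
    with urllib.request.urlopen(request, timeout=300) as response:
        while True:
            chunk = response.read(1024)
            if not chunk:
                break
            buffer += chunk
            # each complete message is terminated by "\0"; its "text" field holds
            # the prompt plus everything generated so far (cumulative output)
            while b'\0' in buffer:
                message, buffer = buffer.split(b'\0', 1)
                if message:
                    print(json.loads(message.decode('utf-8'))["text"])
# Example call (single-turn, mirroring the test above):
# test_api_server_stream([{"role": "Human", "content": "怎么去北京"}])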
-------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/multi_gpus_api_server.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 python api_server.py \ 2 | --model "./Atom-7B-Chat" \ 3 | --port 8090 \ 4 | --tensor-parallel-size 2 5 | -------------------------------------------------------------------------------- /inference-speed/GPU/vllm_example/single_gpu_api_server.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python api_server.py \ 3 | --model "./Atom-7B-Chat" \ 4 | --port 8090 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | bitsandbytes==0.42.0 3 | accelerate==0.27.2 4 | numpy==1.26.4 5 | gekko==1.0.6 6 | pandas 7 | scipy 8 | sentencepiece==0.2.0 9 | datasets 10 | evaluate 11 | pytest 12 | peft==0.8.2 13 | transformers==4.39.0 14 | deepspeed==0.14.0 15 | scikit-learn 16 | torchvision 17 | torchdata 18 | torchaudio 19 | tensorboard 20 | gradio 21 | packaging -------------------------------------------------------------------------------- /scripts/api/README.md: -------------------------------------------------------------------------------- 1 | # API 调用 2 | 3 | ``` 4 | 您可以选择具体想部署的模型下载 5 | git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat 6 | mv Atom-7B-Chat /path/origin_model 7 | ``` 8 | 9 | 首先需要安装额外的依赖 `pip install fastapi uvicorn`,然后运行仓库中的 [accelerate_server.py](accelerate_server.py): 10 | 11 | ```bash 12 | python accelerate_server.py \ 13 | --model_path /path/origin_model \ 14 | --gpus "0" \ 15 | --infer_dtype "int8" \ 16 | --model_source "llama2_chinese" 17 | ``` 18 | 参数说明: 19 | - model_path 模型的本地路径 20 | - gpus 使用的显卡编号,类似"0"、 "0,1" 21 | - infer_dtype 模型加载后的参数数据类型,可以是 int8, float16 22 | - model_source 模型的源,可以是llama2_chinese、llama2_meta、llama3_meta 根据下载的模型不同去区分,如果下载的是[FlagAlpha](https://huggingface.co/FlagAlpha)下载的则用llama2_chinese。 23 | 24 | 25 | 默认部署在本地的 8001 端口,通过 POST 方法进行调用 26 | 27 | ```bash 28 | python accelerate_client.py 29 | ``` 30 | -------------------------------------------------------------------------------- /scripts/api/accelerate_client.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import time 4 | import urllib.request 5 | import sys 6 | 7 | def test_api_server(input_text): 8 | header = {'Content-Type': 'application/json'} 9 | 10 | data = { 11 | "system_prompt": "", 12 | "history": inputs, 13 | "n" : 1, 14 | "best_of": 1, 15 | "presence_penalty": 1.2, 16 | "frequency_penalty": 0.2, 17 | "temperature": 0.3, 18 | "top_p" : 0.95, 19 | "top_k": 50, 20 | "use_beam_search": False, 21 | "stop": [], 22 | "ignore_eos" :False, 23 | "logprobs": None, 24 | "max_new_tokens": 2048, 25 | } 26 | request = urllib.request.Request( 27 | url='http://127.0.0.1:8001/generate', 28 | headers=header, 29 | data=json.dumps(data).encode('utf-8') 30 | ) 31 | 32 | result = None 33 | try: 34 | response = urllib.request.urlopen(request, timeout=300) 35 | res = response.read().decode('utf-8') 36 | result = json.loads(res) 37 | print(json.dumps(data, ensure_ascii=False, indent=2)) 38 | print(json.dumps(result, ensure_ascii=False, indent=2)) 39 | 40 | except Exception as e: 41 | print(e) 42 | 43 | return result 44 | 45 | if __name__ == "__main__": 46 | 47 | # 
多伦对话测试 48 | """ 多伦对话测试 49 | last_question = "怎么回来呢" 50 | inputs = [{"role": "Human", "content": "如何去北京"}, 51 | {"role": "Assitant", "content": "乘坐飞机或者轮船"}, 52 | {"role" : "Human", "content": last_question}] 53 | """ 54 | # 单轮对话 55 | last_question = "怎么去北京" 56 | inputs = [ {"role" : "Human", "content": last_question}] 57 | 58 | test_api_server(inputs) 59 | 60 | -------------------------------------------------------------------------------- /scripts/api/accelerate_server.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import argparse 3 | import gc 4 | import math 5 | import os 6 | import time 7 | 8 | from fastapi import FastAPI, Request 9 | from transformers import AutoTokenizer, AutoModel 10 | import uvicorn, json, datetime 11 | import torch 12 | import torch.distributed as dist 13 | 14 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_path',required=True,type=str) 18 | parser.add_argument('--gpus', default="0", type=str) 19 | parser.add_argument('--infer_dtype', default="int8", choices=["int4", "int8", "float16"], required=False,type=str) 20 | parser.add_argument('--model_source', default="llama2_chinese", choices =["llama2_chinese", "llama2_meta", "llama3_meta"], required=False,type=str) 21 | 22 | args = parser.parse_args() 23 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 24 | 25 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 26 | world_size = torch.cuda.device_count() 27 | 28 | rank = local_rank 29 | 30 | app = FastAPI() 31 | 32 | def get_prompt_llama2chinese( 33 | chat_history, system_prompt="" 34 | ) -> str: 35 | prompt = '' 36 | for input_text_one in chat_history: 37 | prompt += ""+input_text_one['role']+": "+input_text_one['content'].strip()+"\n" 38 | if chat_history[-1]['role']=='Human': 39 | prompt += "Assistant: " 40 | else: 41 | prompt += "Human: " 42 | prompt = prompt[-2048:] 43 | if len(system_prompt)>0: 44 | prompt = 'System: '+system_prompt.strip()+'\n'+prompt 45 | 46 | return prompt 47 | 48 | def get_prompt(chat_history, system_prompt=""): 49 | B_INST, E_INST = "[INST]", "[/INST]" 50 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 51 | 52 | sep = " " 53 | sep2 =" " 54 | stop_token_ids = [2] 55 | system_template = f"[INST] <>\n{system_prompt}\n<>\n\n" 56 | roles = ("[INST]", "[/INST]") 57 | seps = [sep, sep2] 58 | if system_prompt.strip() != "": 59 | ret = system_template 60 | else: 61 | ret = "[INST] " 62 | for i, chat in enumerate(chat_history): 63 | message = chat["content"] 64 | role = chat["role"] 65 | if message: 66 | if i == 0: 67 | ret += message + " " 68 | else: 69 | if role == "Human": 70 | ret += "[INST]" + " " + message + seps[i % 2] 71 | else: 72 | ret += "[/INST]" + " " + message + seps[i % 2] 73 | else: 74 | if role == "Human": 75 | ret += "[INST]" 76 | else: 77 | ret += "[/INST]" 78 | print("prompt:{}".format(ret)) 79 | return ret 80 | 81 | def get_prompt_llama3(chat_history, system_prompt=""): 82 | system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>' 83 | user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>' 84 | assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n' 85 | prompt_str = '' 86 | # 拼接历史对话 87 | for item in chat_history: 88 | if item['role']=='Human': 89 | prompt_str+=user_format.format(content=item['content']) 90 | else: 91 | prompt_str+=assistant_format.format(content=item['content']) 92 | if 
len(system_prompt)>0: 93 | prompt_str = system_format.format(content=system_prompt) + prompt_str 94 | prompt_str = "<|begin_of_text|>" + prompt_str 95 | return prompt_str 96 | 97 | 98 | @app.post("/generate") 99 | async def create_item(request: Request): 100 | global model, tokenizer 101 | json_post_raw = await request.json() 102 | json_post = json.dumps(json_post_raw) 103 | json_post_list = json.loads(json_post) 104 | history = json_post_list.get('history') 105 | system_prompt = json_post_list.get('system_prompt') 106 | max_new_tokens = json_post_list.get('max_new_tokens') 107 | top_p = json_post_list.get('top_p') 108 | temperature = json_post_list.get('temperature') 109 | 110 | if args.model_source == "llama2_meta": 111 | prompt = get_prompt(history, system_prompt) 112 | elif args.model_source == "llama3_meta": 113 | prompt = get_prompt_llama3(history, system_prompt) 114 | else: 115 | prompt = get_prompt_llama2chinese(history, system_prompt) 116 | 117 | inputs = tokenizer([prompt], return_tensors='pt').to("cuda") 118 | generate_kwargs = dict( 119 | inputs, 120 | # streamer=streamer, 121 | max_new_tokens=max_new_tokens, 122 | do_sample=True, 123 | top_p=top_p, 124 | top_k=50, 125 | temperature=temperature, 126 | num_beams=1, 127 | repetition_penalty=1.2, 128 | max_length=2048, 129 | ) 130 | generate_ids = model.generate(**generate_kwargs) 131 | 132 | generate_ids = [item[len(inputs[0]):-1] for item in generate_ids] 133 | 134 | bot_message = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 135 | if 'Human:' in bot_message: 136 | bot_message = bot_message.split('Human:')[0] 137 | 138 | now = datetime.datetime.now() 139 | time = now.strftime("%Y-%m-%d %H:%M:%S") 140 | answer = { 141 | "response": bot_message, 142 | "status": 200, 143 | "time": time 144 | } 145 | return answer 146 | 147 | def get_world_size() -> int: 148 | if dist.is_initialized(): 149 | return dist.get_world_size() 150 | else: 151 | return 1 152 | 153 | def print_rank0(*msg): 154 | if rank != 0: 155 | return 156 | print(*msg) 157 | 158 | 159 | if __name__ == '__main__': 160 | dtype = torch.float16 161 | kwargs = dict( 162 | device_map="auto", 163 | ) 164 | print("get_world_size:{}".format(get_world_size())) 165 | 166 | infer_dtype = args.infer_dtype 167 | if infer_dtype not in ["int4", "int8", "float16"]: 168 | raise ValueError("infer_dtype must one of int4, int8 or float16") 169 | 170 | if get_world_size() > 1: 171 | kwargs["device_map"] = "balanced_low_0" 172 | 173 | if infer_dtype == "int8": 174 | print_rank0("Using `load_in_8bit=True` to use quanitized model") 175 | kwargs["load_in_8bit"] = True 176 | else: 177 | kwargs["torch_dtype"] = dtype 178 | 179 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 180 | if infer_dtype in ["int8", "float16"]: 181 | model = AutoModelForCausalLM.from_pretrained(args.model_path, **kwargs,trust_remote_code=True,use_flash_attention_2=True) 182 | elif infer_dtype == "int4": 183 | from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model 184 | model = AutoGPTQForCausalLM.from_quantized( 185 | args.model_path, device="cuda:0", 186 | use_triton=False, 187 | low_cpu_mem_usage=True, 188 | # inject_fused_attention=False, 189 | # inject_fused_mlp=False 190 | ) 191 | 192 | model.eval() 193 | uvicorn.run(app, host='0.0.0.0', port=8001, workers=1) 194 | -------------------------------------------------------------------------------- /scripts/convert2hf/README.md: 
-------------------------------------------------------------------------------- 1 | ## Meta官网模型权重转换成Hugging Face格式 2 | 3 | 使用脚本 4 | ```bash 5 | python convert_llama_weights_to_hf.py \ 6 | --input_dir /path/to/downloaded/llama/weights \ 7 | --model_size 7B \ 8 | --output_dir /output/path 9 | ``` 10 | 11 | 通过脚本转换后的模型权重可以使用transformers进行加载,例如: 12 | 13 | ```py 14 | from transformers import LlamaForCausalLM, LlamaTokenizer 15 | 16 | model = LlamaForCausalLM.from_pretrained("/output/path") 17 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 18 | ``` -------------------------------------------------------------------------------- /scripts/convert2hf/convert_llama_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import gc 16 | import json 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | import torch 22 | 23 | from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer 24 | 25 | 26 | try: 27 | from transformers import LlamaTokenizerFast 28 | except ImportError as e: 29 | warnings.warn(e) 30 | warnings.warn( 31 | "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" 32 | ) 33 | LlamaTokenizerFast = None 34 | 35 | """ 36 | Sample usage: 37 | 38 | ``` 39 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \ 40 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path 41 | ``` 42 | 43 | Thereafter, models can be loaded via: 44 | 45 | ```py 46 | from transformers import LlamaForCausalLM, LlamaTokenizer 47 | 48 | model = LlamaForCausalLM.from_pretrained("/output/path") 49 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 50 | ``` 51 | 52 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions 53 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
54 | """ 55 | 56 | INTERMEDIATE_SIZE_MAP = { 57 | "7B": 11008, 58 | "13B": 13824, 59 | "30B": 17920, 60 | "65B": 22016, 61 | "70B": 28672, 62 | } 63 | NUM_SHARDS = { 64 | "7B": 1, 65 | "7Bf": 1, 66 | "13B": 2, 67 | "13Bf": 2, 68 | "30B": 4, 69 | "65B": 8, 70 | "70B": 8, 71 | "70Bf": 8, 72 | } 73 | 74 | 75 | def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): 76 | return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) 77 | 78 | 79 | def read_json(path): 80 | with open(path, "r") as f: 81 | return json.load(f) 82 | 83 | 84 | def write_json(text, path): 85 | with open(path, "w") as f: 86 | json.dump(text, f) 87 | 88 | 89 | def write_model(model_path, input_base_path, model_size, safe_serialization=True): 90 | os.makedirs(model_path, exist_ok=True) 91 | tmp_model_path = os.path.join(model_path, "tmp") 92 | os.makedirs(tmp_model_path, exist_ok=True) 93 | 94 | params = read_json(os.path.join(input_base_path, "params.json")) 95 | num_shards = NUM_SHARDS[model_size] 96 | n_layers = params["n_layers"] 97 | n_heads = params["n_heads"] 98 | n_heads_per_shard = n_heads // num_shards 99 | dim = params["dim"] 100 | dims_per_head = dim // n_heads 101 | base = 10000.0 102 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 103 | 104 | if "n_kv_heads" in params: 105 | num_key_value_heads = params["n_kv_heads"] # for GQA / MQA 106 | num_local_key_value_heads = n_heads_per_shard // num_key_value_heads 107 | key_value_dim = dim // num_key_value_heads 108 | else: # compatibility with other checkpoints 109 | num_key_value_heads = n_heads 110 | num_local_key_value_heads = n_heads_per_shard 111 | key_value_dim = dim 112 | 113 | # permute for sliced rotary 114 | def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): 115 | return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) 116 | 117 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.") 118 | # Load weights 119 | if model_size == "7B": 120 | # Not sharded 121 | # (The sharded implementation would also work, but this is simpler.) 
122 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") 123 | else: 124 | # Sharded 125 | loaded = [ 126 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") 127 | for i in range(num_shards) 128 | ] 129 | param_count = 0 130 | index_dict = {"weight_map": {}} 131 | for layer_i in range(n_layers): 132 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" 133 | if model_size == "7B": 134 | # Unsharded 135 | state_dict = { 136 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 137 | loaded[f"layers.{layer_i}.attention.wq.weight"] 138 | ), 139 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 140 | loaded[f"layers.{layer_i}.attention.wk.weight"] 141 | ), 142 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], 143 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], 144 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], 145 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], 146 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], 147 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], 148 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], 149 | } 150 | else: 151 | # Sharded 152 | # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share 153 | # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is 154 | # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
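# In the sharded branch below, wq is viewed with n_heads_per_shard, while wk/wv use
# num_local_key_value_heads and are reshaped to (key_value_dim, dim), matching the
# GQA/MQA handling of n_kv_heads set up earlier in this function.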
155 | 156 | state_dict = { 157 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 158 | f"layers.{layer_i}.attention_norm.weight" 159 | ].clone(), 160 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 161 | f"layers.{layer_i}.ffn_norm.weight" 162 | ].clone(), 163 | } 164 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 165 | torch.cat( 166 | [ 167 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) 168 | for i in range(num_shards) 169 | ], 170 | dim=0, 171 | ).reshape(dim, dim) 172 | ) 173 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 174 | torch.cat( 175 | [ 176 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( 177 | num_local_key_value_heads, dims_per_head, dim 178 | ) 179 | for i in range(num_shards) 180 | ], 181 | dim=0, 182 | ).reshape(key_value_dim, dim), 183 | num_key_value_heads, 184 | key_value_dim, 185 | dim, 186 | ) 187 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 188 | [ 189 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( 190 | num_local_key_value_heads, dims_per_head, dim 191 | ) 192 | for i in range(num_shards) 193 | ], 194 | dim=0, 195 | ).reshape(key_value_dim, dim) 196 | 197 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 198 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 199 | ) 200 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 201 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 202 | ) 203 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 204 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 205 | ) 206 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 207 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 208 | ) 209 | 210 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 211 | for k, v in state_dict.items(): 212 | index_dict["weight_map"][k] = filename 213 | param_count += v.numel() 214 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 215 | 216 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" 217 | if model_size == "7B": 218 | # Unsharded 219 | state_dict = { 220 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 221 | "model.norm.weight": loaded["norm.weight"], 222 | "lm_head.weight": loaded["output.weight"], 223 | } 224 | else: 225 | state_dict = { 226 | "model.norm.weight": loaded[0]["norm.weight"], 227 | "model.embed_tokens.weight": torch.cat( 228 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 229 | ), 230 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), 231 | } 232 | 233 | for k, v in state_dict.items(): 234 | index_dict["weight_map"][k] = filename 235 | param_count += v.numel() 236 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 237 | 238 | # Write configs 239 | index_dict["metadata"] = {"total_size": param_count * 2} 240 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) 241 | ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 242 | multiple_of = params["multiple_of"] if "multiple_of" in params else 256 243 | config = LlamaConfig( 244 | hidden_size=dim, 245 | 
intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), 246 | num_attention_heads=params["n_heads"], 247 | num_hidden_layers=params["n_layers"], 248 | rms_norm_eps=params["norm_eps"], 249 | num_key_value_heads=num_key_value_heads, 250 | ) 251 | config.save_pretrained(tmp_model_path) 252 | 253 | # Make space so we can load the model properly now. 254 | del state_dict 255 | del loaded 256 | gc.collect() 257 | 258 | print("Loading the checkpoint in a Llama model.") 259 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 260 | # Avoid saving this as part of the config. 261 | del model.config._name_or_path 262 | 263 | print("Saving in the Transformers format.") 264 | model.save_pretrained(model_path, safe_serialization=safe_serialization) 265 | shutil.rmtree(tmp_model_path) 266 | 267 | 268 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 269 | # Initialize the tokenizer based on the `spm` model 270 | tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast 271 | print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") 272 | tokenizer = tokenizer_class(input_tokenizer_path) 273 | tokenizer.save_pretrained(tokenizer_path) 274 | 275 | 276 | def main(): 277 | parser = argparse.ArgumentParser() 278 | parser.add_argument( 279 | "--input_dir", 280 | help="Location of LLaMA weights, which contains tokenizer.model and model folders", 281 | ) 282 | parser.add_argument( 283 | "--model_size", 284 | choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"], 285 | help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", 286 | ) 287 | parser.add_argument( 288 | "--output_dir", 289 | help="Location to write HF model and tokenizer", 290 | ) 291 | parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") 292 | args = parser.parse_args() 293 | if args.model_size != "tokenizer_only": 294 | write_model( 295 | model_path=args.output_dir, 296 | # input_base_path=os.path.join(args.input_dir, args.model_size), 297 | input_base_path=args.input_dir, 298 | model_size=args.model_size, 299 | safe_serialization=args.safe_serialization, 300 | ) 301 | spm_path = os.path.join(args.input_dir, "tokenizer.model") 302 | write_tokenizer(args.output_dir, spm_path) 303 | 304 | 305 | if __name__ == "__main__": 306 | main() 307 | 308 | -------------------------------------------------------------------------------- /scripts/test_model/test_pretrain_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n", 11 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 12 | "from transformers import AutoTokenizer,AutoModelForCausalLM\n", 13 | "import torch\n", 14 | "model = AutoModelForCausalLM.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',torch_dtype=torch.float16,device_map='auto',trust_remote_code=True)\n", 15 | "tokenizer = AutoTokenizer.from_pretrained('/mnt/nvme3n1/model_public/Atom1B/checkpoint-480000',use_fast=False)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | 
"outputs": [], 23 | "source": [ 24 | "input_ids = tokenizer(['''Human: 介绍一下北京\\nAssistant: '''], return_tensors=\"pt\",add_special_tokens=False).input_ids.to('cuda') \n", 25 | "print(input_ids) " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "generate_input = {\n", 35 | " \"input_ids\":input_ids,\n", 36 | " \"max_new_tokens\":10,\n", 37 | " \"do_sample\":True,\n", 38 | " \"top_k\":50,\n", 39 | " \"top_p\":0.95,\n", 40 | " \"temperature\":1,\n", 41 | " \"repetition_penalty\":1.0,\n", 42 | " \"eos_token_id\":tokenizer.eos_token_id,\n", 43 | " \"bos_token_id\":tokenizer.bos_token_id,\n", 44 | " \"pad_token_id\":tokenizer.pad_token_id\n", 45 | "}\n", 46 | "generate_ids = model.generate(**generate_input)\n", 47 | "text = tokenizer.decode(generate_ids[0])\n", 48 | "print(text)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# checkpoint-100 的模型输出\n", 56 | "\n", 57 | "# checkpoint-5000 的模型输出\n" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "language_info": { 63 | "name": "python" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 2 68 | } 69 | -------------------------------------------------------------------------------- /train/merge_peft_model/merge.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python merge_peft_adapter.py \ 2 | --adapter_model_name /checkpoint-2200 \ 3 | --output_name checkpoint-2200_merge \ 4 | --load8bit false \ 5 | --tokenizer_fast false -------------------------------------------------------------------------------- /train/merge_peft_model/merge_muilt.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=3 python merge_muilt_peft_adapter.py \ 2 | --adapter_model_name checkpoint-8000 \ 3 | checkpoint-140 \ 4 | --output_name checkpoint-140-8000_merge -------------------------------------------------------------------------------- /train/merge_peft_model/merge_muilt_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional,List 3 | 4 | import peft 5 | import torch 6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification 7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification 8 | from peft.utils import _get_submodules 9 | 10 | @dataclass 11 | class ScriptArguments: 12 | """ 13 | The name of the Casual LM model we wish to fine with PPO 14 | """ 15 | 16 | adapter_model_name: Optional[List[str]] = field(default=None, metadata={"help": "the model name"}) 17 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 18 | 19 | 20 | parser = HfArgumentParser(ScriptArguments) 21 | script_args = parser.parse_args_into_dataclasses()[0] 22 | 23 | base_model = None 24 | for one_lora_path in script_args.adapter_model_name: 25 | if base_model==None: 26 | peft_config = PeftConfig.from_pretrained(one_lora_path) 27 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path) 28 | tokenizer.save_pretrained(f"{script_args.output_name}") 29 | base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.bfloat16) 30 | peft_config = PeftConfig.from_pretrained(one_lora_path) 31 | base_model = 
PeftModel.from_pretrained(base_model, one_lora_path,device_map={"": 0}) 32 | # model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, device_map='auto',load_in_8bit=True) 33 | # Load the Lora model 34 | base_model = base_model.merge_and_unload() 35 | base_model.eval() 36 | 37 | 38 | 39 | 40 | # key_list = [key for key, _ in model.base_model.model.named_modules() if "lora" not in key] 41 | # for key in key_list: 42 | # print(key) 43 | # parent, target, target_name = _get_submodules(model.base_model,key) 44 | # if isinstance(target, peft.tuners.lora.Linear): 45 | # print('peft.tuners.lora.Linear') 46 | # bias = target.bias is not None 47 | # new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) 48 | # model.base_model._replace_module(parent, target_name, new_module, target) 49 | 50 | # model = model.base_model.model 51 | 52 | 53 | base_model.save_pretrained(f"{script_args.output_name}") 54 | # model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) -------------------------------------------------------------------------------- /train/merge_peft_model/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | import peft 5 | import torch 6 | from peft import PeftConfig, PeftModel,PeftModelForSequenceClassification 7 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser,AutoModelForSequenceClassification 8 | from peft.utils import _get_submodules 9 | 10 | @dataclass 11 | class ScriptArguments: 12 | """ 13 | The name of the Casual LM model we wish to fine with PPO 14 | """ 15 | 16 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 17 | load8bit : Optional[bool] = field(default=None, metadata={"help": "the model type"}) 18 | output_name: Optional[str] = field(default=None, metadata={"help": "the model name"}) 19 | tokenizer_fast:Optional[bool] = field(default=None, metadata={"help": "the model type"}) 20 | 21 | 22 | parser = HfArgumentParser(ScriptArguments) 23 | script_args = parser.parse_args_into_dataclasses()[0] 24 | 25 | 26 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 27 | model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, torch_dtype=torch.float16,device_map='auto',trust_remote_code=True) 28 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name,device_map='auto') 29 | tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path,use_fast=script_args.tokenizer_fast) 30 | config = AutoConfig.from_pretrained(peft_config.base_model_name_or_path) 31 | architecture = config.architectures[0] 32 | print(architecture) 33 | # Load the Lora model 34 | model = model.merge_and_unload() 35 | model.eval() 36 | 37 | 38 | model.save_pretrained(f"{script_args.output_name}") 39 | tokenizer.save_pretrained(f"{script_args.output_name}") 40 | if script_args.load8bit: 41 | model = AutoModelForCausalLM.from_pretrained(script_args.output_name, torch_dtype=torch.float16,load_in_8bit=script_args.load8bit,device_map='auto',trust_remote_code=True) 42 | model.save_pretrained(f"{script_args.output_name}",max_shard_size='5GB') -------------------------------------------------------------------------------- /train/pretrain/accuracy.py: -------------------------------------------------------------------------------- 1 
| # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Accuracy metric.""" 15 | 16 | import datasets 17 | from sklearn.metrics import accuracy_score 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 24 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 25 | Where: 26 | TP: True positive 27 | TN: True negative 28 | FP: False positive 29 | FN: False negative 30 | """ 31 | 32 | 33 | _KWARGS_DESCRIPTION = """ 34 | Args: 35 | predictions (`list` of `int`): Predicted labels. 36 | references (`list` of `int`): Ground truth labels. 37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. 38 | sample_weight (`list` of `float`): Sample weights Defaults to None. 39 | 40 | Returns: 41 | accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. 42 | 43 | Examples: 44 | 45 | Example 1-A simple example 46 | >>> accuracy_metric = evaluate.load("accuracy") 47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 48 | >>> print(results) 49 | {'accuracy': 0.5} 50 | 51 | Example 2-The same as Example 1, except with `normalize` set to `False`. 52 | >>> accuracy_metric = evaluate.load("accuracy") 53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) 54 | >>> print(results) 55 | {'accuracy': 3.0} 56 | 57 | Example 3-The same as Example 1, except with `sample_weight` set. 58 | >>> accuracy_metric = evaluate.load("accuracy") 59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) 60 | >>> print(results) 61 | {'accuracy': 0.8778625954198473} 62 | """ 63 | 64 | 65 | _CITATION = """ 66 | @article{scikit-learn, 67 | title={Scikit-learn: Machine Learning in {P}ython}, 68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 71 | Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, 72 | journal={Journal of Machine Learning Research}, 73 | volume={12}, 74 | pages={2825--2830}, 75 | year={2011} 76 | } 77 | """ 78 | 79 | 80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 81 | class Accuracy(evaluate.Metric): 82 | def _info(self): 83 | return evaluate.MetricInfo( 84 | description=_DESCRIPTION, 85 | citation=_CITATION, 86 | inputs_description=_KWARGS_DESCRIPTION, 87 | features=datasets.Features( 88 | { 89 | "predictions": datasets.Sequence(datasets.Value("int32")), 90 | "references": datasets.Sequence(datasets.Value("int32")), 91 | } 92 | if self.config_name == "multilabel" 93 | else { 94 | "predictions": datasets.Value("int32"), 95 | "references": datasets.Value("int32"), 96 | } 97 | ), 98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], 99 | ) 100 | 101 | def _compute(self, predictions, references, normalize=True, sample_weight=None): 102 | return { 103 | "accuracy": float( 104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) 105 | ) 106 | } 107 | -------------------------------------------------------------------------------- /train/pretrain/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | 20 | "scheduler": { 21 | "type": "WarmupDecayLR", 22 | "params": { 23 | "last_batch_iteration": -1, 24 | "total_num_steps": "auto", 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | 31 | "zero_optimization": { 32 | "stage": 2, 33 | "offload_optimizer": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "offload_param": { 38 | "device": "cpu", 39 | "pin_memory": true 40 | }, 41 | "allgather_partitions": true, 42 | "allgather_bucket_size": 5e8, 43 | "overlap_comm": true, 44 | "reduce_scatter": true, 45 | "reduce_bucket_size": 5e8, 46 | "contiguous_gradients": true 47 | }, 48 | "activation_checkpointing": { 49 | "partition_activations": false, 50 | "cpu_checkpointing": false, 51 | "contiguous_memory_optimization": false, 52 | "number_checkpoints": null, 53 | "synchronize_checkpoint_boundary": false, 54 | "profile": false 55 | }, 56 | "gradient_accumulation_steps": "auto", 57 | "gradient_clipping": "auto", 58 | "steps_per_print": 2000, 59 | "train_batch_size": "auto", 60 | "min_lr": 5e-7, 61 | "train_micro_batch_size_per_gpu": "auto", 62 | "wall_clock_breakdown": false 63 | } -------------------------------------------------------------------------------- /train/pretrain/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1, 9 | "fp16_opt_level": "O2" 10 | }, 11 | 12 | "bf16": { 13 | "enabled": "auto" 14 | }, 15 | 16 | "optimizer": { 17 | "type": "AdamW", 18 | "params": { 19 | "lr": "auto", 20 | "betas": "auto", 21 | "eps": "auto", 22 | "weight_decay": "auto" 23 | } 24 | }, 25 | 26 | "scheduler": { 27 | "type": "WarmupDecayLR", 28 | "params": { 29 | 
"last_batch_iteration": -1, 30 | "total_num_steps": "auto", 31 | "warmup_min_lr": "auto", 32 | "warmup_max_lr": "auto", 33 | "warmup_num_steps": "auto" 34 | } 35 | }, 36 | 37 | "zero_optimization": { 38 | "stage": 3, 39 | "overlap_comm": true, 40 | "contiguous_gradients": true, 41 | "sub_group_size": 1e9, 42 | "reduce_bucket_size": "auto", 43 | "stage3_prefetch_bucket_size": "auto", 44 | "stage3_param_persistence_threshold": "auto", 45 | "stage3_max_live_parameters": 1e9, 46 | "stage3_max_reuse_distance": 1e9, 47 | "gather_16bit_weights_on_model_save": true 48 | }, 49 | "gradient_accumulation_steps": "auto", 50 | "gradient_clipping": "auto", 51 | "steps_per_print": 2000, 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "wall_clock_breakdown": false 55 | } -------------------------------------------------------------------------------- /train/pretrain/pretrain.sh: -------------------------------------------------------------------------------- 1 | output_model=output_model 2 | if [ ! -d ${output_model} ];then 3 | mkdir ${output_model} 4 | fi 5 | cp ./pretrain.sh ${output_model} 6 | cp ./ds_config_zero*.json ${output_model} 7 | export CUDA_HOME=/usr/local/cuda/ 8 | export NCCL_P2P_DISABLE=1 9 | 10 | deepspeed --include localhost:0,2 pretrain_clm.py \ 11 | --config_name ../../model_config/Atom-100M/config.json \ 12 | --tokenizer_name ../../model_config/Atom-100M \ 13 | --train_files ../../data/wiki_zh/train_lm_task_0.csv \ 14 | ../../data/wiki_zh/train_lm_task_1.csv \ 15 | --validation_files ../../data/wiki_zh/dev_lm_task.csv \ 16 | --per_device_train_batch_size 32 \ 17 | --per_device_eval_batch_size 32 \ 18 | --do_train \ 19 | --output_dir ${output_model} \ 20 | --evaluation_strategy steps \ 21 | --use_fast_tokenizer false \ 22 | --max_eval_samples 500 \ 23 | --learning_rate 1e-4 \ 24 | --gradient_accumulation_steps 2 \ 25 | --num_train_epochs 3 \ 26 | --warmup_steps 5000 \ 27 | --logging_dir ${output_model}/logs \ 28 | --logging_strategy steps \ 29 | --logging_steps 5 \ 30 | --save_strategy steps \ 31 | --preprocessing_num_workers 10 \ 32 | --save_steps 100 \ 33 | --eval_steps 5000000 \ 34 | --save_total_limit 2000 \ 35 | --seed 42 \ 36 | --disable_tqdm false \ 37 | --ddp_find_unused_parameters false \ 38 | --block_size 1024 \ 39 | --overwrite_output_dir \ 40 | --report_to tensorboard \ 41 | --run_name ${output_model} \ 42 | --bf16 \ 43 | --bf16_full_eval \ 44 | --gradient_checkpointing \ 45 | --deepspeed ./ds_config_zero3.json \ 46 | --ignore_data_skip true \ 47 | --ddp_timeout 18000000 \ 48 | | tee -a ${output_model}/train.log 49 | 50 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 51 | -------------------------------------------------------------------------------- /train/pretrain/pretrain_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | from dataclasses import dataclass, field 28 | from torchdata.datapipes.iter import IterDataPipe, IterableWrapper 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List 32 | 33 | import datasets 34 | import pandas as pd 35 | import evaluate 36 | import torch 37 | from datasets import load_dataset 38 | from datasets.combine import interleave_datasets 39 | import transformers 40 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 41 | from transformers import ( 42 | CONFIG_MAPPING, 43 | MODEL_FOR_CAUSAL_LM_MAPPING, 44 | AutoConfig, 45 | AutoModelForCausalLM, 46 | AutoTokenizer, 47 | TrainerCallback, 48 | TrainerState, 49 | TrainerControl, 50 | HfArgumentParser, 51 | Trainer, 52 | TrainingArguments, 53 | default_data_collator, 54 | is_torch_tpu_available, 55 | set_seed, 56 | ) 57 | import datetime 58 | from transformers.testing_utils import CaptureLogger 59 | from transformers.trainer_utils import get_last_checkpoint 60 | from transformers.utils import check_min_version, send_example_telemetry 61 | from transformers.utils.versions import require_version 62 | from datasets import interleave_datasets 63 | 64 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 65 | # check_min_version("4.27.0.dev0") 66 | 67 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 68 | 69 | logger = logging.getLogger(__name__) 70 | 71 | 72 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 73 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 74 | 75 | @dataclass 76 | class ModelArguments: 77 | """ 78 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 79 | """ 80 | 81 | model_name_or_path: Optional[str] = field( 82 | default=None, 83 | metadata={ 84 | "help": ( 85 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 86 | ) 87 | }, 88 | ) 89 | model_type: Optional[str] = field( 90 | default=None, 91 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 92 | ) 93 | config_overrides: Optional[str] = field( 94 | default=None, 95 | metadata={ 96 | "help": ( 97 | "Override some existing default config settings when a model is trained from scratch. 
Example: " 98 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 99 | ) 100 | }, 101 | ) 102 | config_name: Optional[str] = field( 103 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 104 | ) 105 | tokenizer_name: Optional[str] = field( 106 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 107 | ) 108 | cache_dir: Optional[str] = field( 109 | default=None, 110 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 111 | ) 112 | use_fast_tokenizer: bool = field( 113 | default=True, 114 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 115 | ) 116 | model_revision: str = field( 117 | default="main", 118 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 119 | ) 120 | use_auth_token: bool = field( 121 | default=False, 122 | metadata={ 123 | "help": ( 124 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 125 | "with private models)." 126 | ) 127 | }, 128 | ) 129 | torch_dtype: Optional[str] = field( 130 | default=None, 131 | metadata={ 132 | "help": ( 133 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 134 | "dtype will be automatically derived from the model's weights." 135 | ), 136 | "choices": ["auto", "bfloat16", "float16", "float32"], 137 | }, 138 | ) 139 | 140 | def __post_init__(self): 141 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 142 | raise ValueError( 143 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 144 | ) 145 | 146 | 147 | @dataclass 148 | class DataTrainingArguments: 149 | """ 150 | Arguments pertaining to what data we are going to input our model for training and eval. 151 | """ 152 | 153 | dataset_name: Optional[str] = field( 154 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 155 | ) 156 | dataset_config_name: Optional[str] = field( 157 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 158 | ) 159 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 160 | validation_files: Optional[List[str]] = field( 161 | default=None, 162 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 163 | ) 164 | max_train_samples: Optional[int] = field( 165 | default=None, 166 | metadata={ 167 | "help": ( 168 | "For debugging purposes or quicker training, truncate the number of training examples to this " 169 | "value if set." 170 | ) 171 | }, 172 | ) 173 | max_eval_samples: Optional[int] = field( 174 | default=None, 175 | metadata={ 176 | "help": ( 177 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 178 | "value if set." 179 | ) 180 | }, 181 | ) 182 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 183 | block_size: Optional[int] = field( 184 | default=None, 185 | metadata={ 186 | "help": ( 187 | "Optional input sequence length after tokenization. " 188 | "The training dataset will be truncated in block of this size for training. 
" 189 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 190 | ) 191 | }, 192 | ) 193 | overwrite_cache: bool = field( 194 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 195 | ) 196 | validation_split_percentage: Optional[int] = field( 197 | default=5, 198 | metadata={ 199 | "help": "The percentage of the train set used as validation set in case there's no validation split" 200 | }, 201 | ) 202 | preprocessing_num_workers: Optional[int] = field( 203 | default=None, 204 | metadata={"help": "The number of processes to use for the preprocessing."}, 205 | ) 206 | keep_linebreaks: bool = field( 207 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 208 | ) 209 | 210 | def __post_init__(self): 211 | if self.streaming: 212 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 213 | 214 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 215 | raise ValueError("Need either a dataset name or a training/validation file.") 216 | else: 217 | if self.train_files is not None: 218 | extension = self.train_files[0].split(".")[-1] 219 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 220 | if self.validation_files is not None: 221 | extension = self.validation_files[0].split(".")[-1] 222 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 223 | 224 | def main(): 225 | # See all possible arguments in src/transformers/training_args.py 226 | # or by passing the --help flag to this script. 227 | # We now keep distinct sets of args, for a cleaner separation of concerns. 228 | 229 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 230 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 231 | # If we pass only one argument to the script and it's the path to a json file, 232 | # let's parse it to get our arguments. 233 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 234 | else: 235 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 236 | 237 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 238 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 239 | send_example_telemetry("run_clm", model_args, data_args) 240 | 241 | # Setup logging 242 | logging.basicConfig( 243 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 244 | datefmt="%m/%d/%Y %H:%M:%S", 245 | handlers=[logging.StreamHandler(sys.stdout)], 246 | ) 247 | 248 | if training_args.should_log: 249 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
250 | transformers.utils.logging.set_verbosity_info() 251 | 252 | log_level = training_args.get_process_log_level() 253 | logger.setLevel(log_level) 254 | datasets.utils.logging.set_verbosity(log_level) 255 | transformers.utils.logging.set_verbosity(log_level) 256 | transformers.utils.logging.enable_default_handler() 257 | transformers.utils.logging.enable_explicit_format() 258 | 259 | # Log on each process the small summary: 260 | logger.warning( 261 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 262 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 263 | ) 264 | logger.info(f"Training/evaluation parameters {training_args}") 265 | 266 | # Detecting last checkpoint. 267 | last_checkpoint = None 268 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 269 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 270 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 271 | raise ValueError( 272 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 273 | "Use --overwrite_output_dir to overcome." 274 | ) 275 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 276 | logger.info( 277 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 278 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 279 | ) 280 | 281 | # Set seed before initializing model. 282 | set_seed(training_args.seed) 283 | 284 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 285 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 286 | # (the dataset will be downloaded automatically from the datasets Hub). 287 | # 288 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 289 | # 'text' is found. You can easily tweak this behavior (see below). 290 | # 291 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 292 | # download the dataset. 293 | if True: 294 | data_files = {} 295 | dataset_args = {} 296 | if data_args.train_files is not None: 297 | 298 | print(data_args.train_files) 299 | data_files["train"] = data_args.train_files 300 | print('训练文件总个数',len(data_args.train_files)) 301 | if data_args.validation_files is not None: 302 | data_files["validation"] = data_args.validation_files 303 | extension = ( 304 | data_files["train"][0].split(".")[-1] 305 | if data_files["train"] is not None 306 | else data_args.validation_files.split(".")[-1] 307 | ) 308 | if extension == "txt": 309 | extension = "text" 310 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 311 | 312 | 313 | raw_datasets = load_dataset( 314 | extension, 315 | data_files=data_files, 316 | streaming=data_args.streaming, 317 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 318 | use_auth_token=True if model_args.use_auth_token else None, 319 | **dataset_args, 320 | ) 321 | if data_args.streaming: 322 | raw_datasets = raw_datasets.shuffle(seed=training_args.seed, buffer_size=1000000) 323 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
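# e.g. with the default validation_split_percentage=5, train[:5%] becomes the
# validation split and train[5%:] the training split in the fallback below.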
324 | if "validation" not in raw_datasets.keys(): 325 | raw_datasets["validation"] = load_dataset( 326 | extension, 327 | data_files=data_files, 328 | split=f"train[:{data_args.validation_split_percentage}%]", 329 | cache_dir=model_args.cache_dir, 330 | use_auth_token=True if model_args.use_auth_token else None, 331 | **dataset_args, 332 | ) 333 | raw_datasets["train"] = load_dataset( 334 | extension, 335 | data_files=data_files, 336 | split=f"train[{data_args.validation_split_percentage}%:]", 337 | cache_dir=model_args.cache_dir, 338 | use_auth_token=True if model_args.use_auth_token else None, 339 | **dataset_args, 340 | ) 341 | 342 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 343 | # https://huggingface.co/docs/datasets/loading_datasets.html. 344 | 345 | # Load pretrained model and tokenizer 346 | # 347 | # Distributed training: 348 | # The .from_pretrained methods guarantee that only one local process can concurrently 349 | # download model & vocab. 350 | 351 | config_kwargs = { 352 | "cache_dir": model_args.cache_dir, 353 | "revision": model_args.model_revision, 354 | "use_auth_token": True if model_args.use_auth_token else None, 355 | } 356 | if model_args.config_name: 357 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 358 | elif model_args.model_name_or_path: 359 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 360 | else: 361 | config = CONFIG_MAPPING[model_args.model_type]() 362 | logger.warning("You are instantiating a new config instance from scratch.") 363 | if model_args.config_overrides is not None: 364 | logger.info(f"Overriding config: {model_args.config_overrides}") 365 | config.update_from_string(model_args.config_overrides) 366 | logger.info(f"New config: {config}") 367 | 368 | print(training_args.local_rank,'start load tokenizer') 369 | tokenizer_kwargs = { 370 | "cache_dir": model_args.cache_dir, 371 | "use_fast": model_args.use_fast_tokenizer, 372 | "revision": model_args.model_revision, 373 | "use_auth_token": True if model_args.use_auth_token else None, 374 | } 375 | if model_args.tokenizer_name: 376 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 377 | elif model_args.model_name_or_path: 378 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 379 | else: 380 | raise ValueError( 381 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 382 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
383 | ) 384 | print(training_args.local_rank,'end load tokenizer') 385 | print(training_args.local_rank,'start load model') 386 | if model_args.model_name_or_path: 387 | torch_dtype = ( 388 | model_args.torch_dtype 389 | if model_args.torch_dtype in ["auto", None] 390 | else getattr(torch, model_args.torch_dtype) 391 | ) 392 | model = AutoModelForCausalLM.from_pretrained( 393 | model_args.model_name_or_path, 394 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 395 | config=config, 396 | cache_dir=model_args.cache_dir, 397 | revision=model_args.model_revision, 398 | trust_remote_code=True, 399 | use_flash_attention_2=True, 400 | use_auth_token=True if model_args.use_auth_token else None, 401 | ) 402 | else: 403 | model = AutoModelForCausalLM.from_config(config,trust_remote_code=True) 404 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 405 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 406 | print(training_args.local_rank,'end load model') 407 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 408 | # on a small vocab and want a smaller embedding size, remove this test. 409 | embedding_size = model.get_input_embeddings().weight.shape[0] 410 | if len(tokenizer) > embedding_size: 411 | model.resize_token_embeddings(len(tokenizer)) 412 | # Preprocessing the datasets. 413 | # First we tokenize all the texts. 414 | if training_args.do_train: 415 | if data_args.streaming: 416 | dataset_head = raw_datasets["train"].take(3) 417 | print(list(dataset_head)) 418 | column_names = list(list(dataset_head)[0].keys()) 419 | else: 420 | column_names = list(raw_datasets["train"].features) 421 | else: 422 | if data_args.streaming: 423 | dataset_head = raw_datasets["validation"].take(3) 424 | column_names = list(list(dataset_head)[0].keys()) 425 | else: 426 | column_names = list(raw_datasets["validation"].features) 427 | print(column_names) 428 | text_column_name = "text" if "text" in column_names else column_names[0] 429 | 430 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 431 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 432 | 433 | def tokenize_function(examples): 434 | with CaptureLogger(tok_logger) as cl: 435 | output = tokenizer( [ item for item in examples[text_column_name]]) 436 | return output 437 | 438 | with training_args.main_process_first(desc="dataset map tokenization"): 439 | if not data_args.streaming: 440 | tokenized_datasets = raw_datasets.map( 441 | tokenize_function, 442 | batched=True, 443 | num_proc=data_args.preprocessing_num_workers, 444 | remove_columns=column_names, 445 | load_from_cache_file=not data_args.overwrite_cache, 446 | desc="Running tokenizer on dataset", 447 | ) 448 | else: 449 | tokenized_datasets = raw_datasets.map( 450 | tokenize_function, 451 | batched=True, 452 | remove_columns=column_names, 453 | batch_size = 60000, 454 | ) 455 | 456 | if data_args.block_size is None: 457 | block_size = tokenizer.model_max_length 458 | if block_size > 1024: 459 | logger.warning( 460 | "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" 461 | " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" 462 | " override this default with `--block_size xxx`." 
463 | ) 464 | block_size = 1024 465 | else: 466 | if data_args.block_size > tokenizer.model_max_length: 467 | logger.warning( 468 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" 469 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 470 | ) 471 | block_size = min(data_args.block_size, tokenizer.model_max_length) 472 | 473 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 474 | def group_texts(examples): 475 | # Concatenate all texts. 476 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 477 | # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 478 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 479 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 480 | # customize this part to your needs. 481 | if total_length >= block_size: 482 | total_length = (total_length // block_size) * block_size 483 | # Split by chunks of max_len. 484 | result = { 485 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 486 | for k, t in concatenated_examples.items() 487 | } 488 | # print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 489 | logger.info("group texts input examples length%d after_group size%d"%(len(examples['input_ids']),len(result["input_ids"]))) 490 | result["labels"] = result["input_ids"].copy() 491 | return result 492 | 493 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 494 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 495 | # to preprocess. 496 | # 497 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 498 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 499 | 500 | with training_args.main_process_first(desc="grouping texts together"): 501 | if not data_args.streaming: 502 | lm_datasets = tokenized_datasets.map( 503 | group_texts, 504 | batched=True, 505 | num_proc=data_args.preprocessing_num_workers, 506 | load_from_cache_file=not data_args.overwrite_cache, 507 | desc=f"Grouping texts in chunks of {block_size}", 508 | batch_size = 40000, 509 | ) 510 | else: 511 | lm_datasets = tokenized_datasets.map( 512 | group_texts, 513 | batched=True, 514 | batch_size = 60000, 515 | ) 516 | print(training_args.local_rank,'start select train_dataset') 517 | if training_args.do_train: 518 | if "train" not in tokenized_datasets: 519 | raise ValueError("--do_train requires a train dataset") 520 | train_dataset = lm_datasets["train"] 521 | if data_args.max_train_samples is not None and data_args.streaming==False: 522 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 523 | train_dataset = train_dataset.select(range(max_train_samples)) 524 | print(training_args.local_rank,'end select train_dataset') 525 | 526 | if training_args.do_eval: 527 | if "validation" not in tokenized_datasets: 528 | raise ValueError("--do_eval requires a validation dataset") 529 | print(training_args.local_rank,'start select eval_dataset') 530 | eval_dataset = lm_datasets["validation"] 531 | if data_args.max_eval_samples is not None and data_args.streaming==False : 532 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 533 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 534 | print(training_args.local_rank,'end select eval_dataset') 535 | def preprocess_logits_for_metrics(logits, labels): 536 | if isinstance(logits, tuple): 537 | # Depending on the model and config, logits may contain extra tensors, 538 | # like past_key_values, but logits always come first 539 | logits = logits[0] 540 | return logits.argmax(dim=-1) 541 | print(training_args.local_rank,'start load metric') 542 | metric = evaluate.load("accuracy.py") 543 | print(training_args.local_rank,'end load metric') 544 | 545 | def compute_metrics(eval_preds): 546 | preds, labels = eval_preds 547 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 548 | # by preprocess_logits_for_metrics but we need to shift the labels 549 | labels = labels[:, 1:].reshape(-1) 550 | preds = preds[:, :-1].reshape(-1) 551 | return metric.compute(predictions=preds, references=labels) 552 | 553 | print(training_args.local_rank,'Initialize our Trainer') 554 | trainer = Trainer( 555 | model=model, 556 | args=training_args, 557 | train_dataset= IterableWrapper(train_dataset) if training_args.do_train else None, 558 | eval_dataset= IterableWrapper(eval_dataset) if training_args.do_eval else None, 559 | tokenizer=tokenizer, 560 | # Data collator will default to DataCollatorWithPadding, so we change it. 
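# (default_data_collator does no padding at all; that is fine here because group_texts above has
# already packed every example to exactly `block_size` tokens. The SFT script later in this
# repository, which keeps variable-length examples, switches to DataCollatorForSeq2Seq with
# padding instead.)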
561 | data_collator=default_data_collator, 562 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 563 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 564 | # callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None), 565 | ) 566 | 567 | if training_args.do_train: 568 | checkpoint = None 569 | if training_args.resume_from_checkpoint is not None: 570 | checkpoint = training_args.resume_from_checkpoint 571 | elif last_checkpoint is not None: 572 | checkpoint = last_checkpoint 573 | 574 | print(training_args.local_rank,'start train') 575 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 576 | trainer.save_model() # Saves the tokenizer too for easy upload 577 | 578 | metrics = train_result.metrics 579 | 580 | max_train_samples = ( 581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 582 | ) 583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 584 | 585 | trainer.log_metrics("train", metrics) 586 | trainer.save_metrics("train", metrics) 587 | trainer.save_state() 588 | 589 | # Evaluation 590 | if training_args.do_eval: 591 | logger.info("*** Evaluate ***") 592 | 593 | metrics = trainer.evaluate() 594 | 595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 597 | try: 598 | perplexity = math.exp(metrics["eval_loss"]) 599 | except OverflowError: 600 | perplexity = float("inf") 601 | metrics["perplexity"] = perplexity 602 | 603 | trainer.log_metrics("eval", metrics) 604 | trainer.save_metrics("eval", metrics) 605 | 606 | 607 | 608 | def _mp_fn(index): 609 | # For xla_spawn (TPUs) 610 | main() 611 | 612 | 613 | if __name__ == "__main__": 614 | main() 615 | -------------------------------------------------------------------------------- /train/sft/accuracy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Accuracy metric.""" 15 | 16 | import datasets 17 | from sklearn.metrics import accuracy_score 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 24 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 25 | Where: 26 | TP: True positive 27 | TN: True negative 28 | FP: False positive 29 | FN: False negative 30 | """ 31 | 32 | 33 | _KWARGS_DESCRIPTION = """ 34 | Args: 35 | predictions (`list` of `int`): Predicted labels. 36 | references (`list` of `int`): Ground truth labels. 37 | normalize (`boolean`): If set to False, returns the number of correctly classified samples. 
Otherwise, returns the fraction of correctly classified samples. Defaults to True. 38 | sample_weight (`list` of `float`): Sample weights Defaults to None. 39 | 40 | Returns: 41 | accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. 42 | 43 | Examples: 44 | 45 | Example 1-A simple example 46 | >>> accuracy_metric = evaluate.load("accuracy") 47 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 48 | >>> print(results) 49 | {'accuracy': 0.5} 50 | 51 | Example 2-The same as Example 1, except with `normalize` set to `False`. 52 | >>> accuracy_metric = evaluate.load("accuracy") 53 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) 54 | >>> print(results) 55 | {'accuracy': 3.0} 56 | 57 | Example 3-The same as Example 1, except with `sample_weight` set. 58 | >>> accuracy_metric = evaluate.load("accuracy") 59 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) 60 | >>> print(results) 61 | {'accuracy': 0.8778625954198473} 62 | """ 63 | 64 | 65 | _CITATION = """ 66 | @article{scikit-learn, 67 | title={Scikit-learn: Machine Learning in {P}ython}, 68 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 69 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 70 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 71 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, 72 | journal={Journal of Machine Learning Research}, 73 | volume={12}, 74 | pages={2825--2830}, 75 | year={2011} 76 | } 77 | """ 78 | 79 | 80 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 81 | class Accuracy(evaluate.Metric): 82 | def _info(self): 83 | return evaluate.MetricInfo( 84 | description=_DESCRIPTION, 85 | citation=_CITATION, 86 | inputs_description=_KWARGS_DESCRIPTION, 87 | features=datasets.Features( 88 | { 89 | "predictions": datasets.Sequence(datasets.Value("int32")), 90 | "references": datasets.Sequence(datasets.Value("int32")), 91 | } 92 | if self.config_name == "multilabel" 93 | else { 94 | "predictions": datasets.Value("int32"), 95 | "references": datasets.Value("int32"), 96 | } 97 | ), 98 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], 99 | ) 100 | 101 | def _compute(self, predictions, references, normalize=True, sample_weight=None): 102 | return { 103 | "accuracy": float( 104 | accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) 105 | ) 106 | } 107 | -------------------------------------------------------------------------------- /train/sft/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | 20 | "scheduler": { 21 | "type": "WarmupDecayLR", 22 | "params": { 23 | "last_batch_iteration": -1, 24 | "total_num_steps": "auto", 25 | "warmup_min_lr": "auto", 26 | 
"warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | 31 | "zero_optimization": { 32 | "stage": 2, 33 | "offload_optimizer": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "offload_param": { 38 | "device": "cpu", 39 | "pin_memory": true 40 | }, 41 | "allgather_partitions": true, 42 | "allgather_bucket_size": 5e8, 43 | "overlap_comm": true, 44 | "reduce_scatter": true, 45 | "reduce_bucket_size": 5e8, 46 | "contiguous_gradients": true 47 | }, 48 | "activation_checkpointing": { 49 | "partition_activations": false, 50 | "cpu_checkpointing": false, 51 | "contiguous_memory_optimization": false, 52 | "number_checkpoints": null, 53 | "synchronize_checkpoint_boundary": false, 54 | "profile": false 55 | }, 56 | "gradient_accumulation_steps": "auto", 57 | "gradient_clipping": "auto", 58 | "steps_per_print": 2000, 59 | "train_batch_size": "auto", 60 | "min_lr": 5e-7, 61 | "train_micro_batch_size_per_gpu": "auto", 62 | "wall_clock_breakdown": false 63 | } -------------------------------------------------------------------------------- /train/sft/finetune.sh: -------------------------------------------------------------------------------- 1 | output_model=save_folder 2 | # 需要修改到自己的输入目录 3 | if [ ! -d ${output_model} ];then 4 | mkdir ${output_model} 5 | fi 6 | cp ./finetune.sh ${output_model} 7 | deepspeed --include localhost:1,0 finetune_clm.py \ 8 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ 9 | --train_files ../../data/train_sft.csv \ 10 | --validation_files ../../data/dev_sft.csv \ 11 | ../../data/dev_sft_sharegpt.csv \ 12 | --per_device_train_batch_size 1 \ 13 | --per_device_eval_batch_size 1 \ 14 | --do_train \ 15 | --do_eval \ 16 | --use_fast_tokenizer false \ 17 | --output_dir ${output_model} \ 18 | --evaluation_strategy steps \ 19 | --max_eval_samples 800 \ 20 | --learning_rate 1e-4 \ 21 | --gradient_accumulation_steps 8 \ 22 | --num_train_epochs 10 \ 23 | --warmup_steps 400 \ 24 | --logging_dir ${output_model}/logs \ 25 | --logging_strategy steps \ 26 | --logging_steps 10 \ 27 | --save_strategy steps \ 28 | --preprocessing_num_workers 10 \ 29 | --save_steps 20 \ 30 | --eval_steps 20 \ 31 | --save_total_limit 2000 \ 32 | --seed 42 \ 33 | --disable_tqdm false \ 34 | --ddp_find_unused_parameters false \ 35 | --block_size 2048 \ 36 | --report_to tensorboard \ 37 | --overwrite_output_dir \ 38 | --deepspeed ds_config_zero2.json \ 39 | --ignore_data_skip true \ 40 | --bf16 \ 41 | --gradient_checkpointing \ 42 | --bf16_full_eval \ 43 | --ddp_timeout 18000000 \ 44 | | tee -a ${output_model}/train.log 45 | 46 | 47 | 48 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 49 | -------------------------------------------------------------------------------- /train/sft/finetune_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | import random 28 | from dataclasses import dataclass, field 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List,Union 32 | 33 | import datasets 34 | import evaluate 35 | import torch 36 | from datasets import load_dataset 37 | from peft import ( # noqa: E402 38 | LoraConfig, 39 | PeftModel, 40 | get_peft_model, 41 | get_peft_model_state_dict, 42 | prepare_model_for_int8_training, 43 | prepare_model_for_kbit_training, 44 | set_peft_model_state_dict, 45 | ) 46 | import transformers 47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 48 | from transformers import ( 49 | CONFIG_MAPPING, 50 | MODEL_FOR_CAUSAL_LM_MAPPING, 51 | AutoConfig, 52 | AutoModelForCausalLM, 53 | AutoTokenizer, 54 | TrainerCallback, 55 | TrainerState, 56 | TrainerControl, 57 | HfArgumentParser, 58 | Trainer, 59 | TrainingArguments, 60 | default_data_collator, 61 | BitsAndBytesConfig, 62 | is_torch_tpu_available, 63 | set_seed, 64 | ) 65 | from transformers.testing_utils import CaptureLogger 66 | from transformers.trainer_utils import get_last_checkpoint 67 | from transformers.utils import check_min_version, send_example_telemetry 68 | from transformers.utils.versions import require_version 69 | 70 | import pdb 71 | 72 | 73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 74 | # check_min_version("4.27.0.dev0") 75 | 76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 77 | 78 | logger = logging.getLogger(__name__) 79 | 80 | 81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 83 | 84 | 85 | @dataclass 86 | class ModelArguments: 87 | """ 88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 89 | """ 90 | 91 | model_name_or_path: Optional[str] = field( 92 | default=None, 93 | metadata={ 94 | "help": ( 95 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 96 | ) 97 | }, 98 | ) 99 | model_type: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 102 | ) 103 | config_overrides: Optional[str] = field( 104 | default=None, 105 | metadata={ 106 | "help": ( 107 | "Override some existing default config settings when a model is trained from scratch. 
Example: " 108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 109 | ) 110 | }, 111 | ) 112 | config_name: Optional[str] = field( 113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 114 | ) 115 | tokenizer_name: Optional[str] = field( 116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 117 | ) 118 | cache_dir: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 121 | ) 122 | use_fast_tokenizer: bool = field( 123 | default=True, 124 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 125 | ) 126 | model_revision: str = field( 127 | default="main", 128 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 129 | ) 130 | use_auth_token: bool = field( 131 | default=False, 132 | metadata={ 133 | "help": ( 134 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 135 | "with private models)." 136 | ) 137 | }, 138 | ) 139 | 140 | torch_dtype: Optional[str] = field( 141 | default=None, 142 | metadata={ 143 | "help": ( 144 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 145 | "dtype will be automatically derived from the model's weights." 146 | ), 147 | "choices": ["auto", "bfloat16", "float16", "float32"], 148 | }, 149 | ) 150 | 151 | def __post_init__(self): 152 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 153 | raise ValueError( 154 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 155 | ) 156 | 157 | 158 | 159 | @dataclass 160 | class DataTrainingArguments: 161 | """ 162 | Arguments pertaining to what data we are going to input our model for training and eval. 163 | """ 164 | train_on_inputs: bool = field( 165 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 166 | ) 167 | dataset_name: Optional[str] = field( 168 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 169 | ) 170 | dataset_config_name: Optional[str] = field( 171 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 172 | ) 173 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 174 | validation_files: Optional[List[str]] = field( 175 | default=None, 176 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 177 | ) 178 | max_train_samples: Optional[int] = field( 179 | default=None, 180 | metadata={ 181 | "help": ( 182 | "For debugging purposes or quicker training, truncate the number of training examples to this " 183 | "value if set." 184 | ) 185 | }, 186 | ) 187 | max_eval_samples: Optional[int] = field( 188 | default=None, 189 | metadata={ 190 | "help": ( 191 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 192 | "value if set." 
193 | ) 194 | }, 195 | ) 196 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 197 | block_size: Optional[int] = field( 198 | default=None, 199 | metadata={ 200 | "help": ( 201 | "Optional input sequence length after tokenization. " 202 | "The training dataset will be truncated in block of this size for training. " 203 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 204 | ) 205 | }, 206 | ) 207 | overwrite_cache: bool = field( 208 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 209 | ) 210 | validation_split_percentage: Optional[int] = field( 211 | default=5, 212 | metadata={ 213 | "help": "The percentage of the train set used as validation set in case there's no validation split" 214 | }, 215 | ) 216 | preprocessing_num_workers: Optional[int] = field( 217 | default=None, 218 | metadata={"help": "The number of processes to use for the preprocessing."}, 219 | ) 220 | keep_linebreaks: bool = field( 221 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 222 | ) 223 | 224 | def __post_init__(self): 225 | if self.streaming: 226 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 227 | 228 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 229 | raise ValueError("Need either a dataset name or a training/validation file.") 230 | else: 231 | if self.train_files is not None: 232 | extension = self.train_files[0].split(".")[-1] 233 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 234 | if self.validation_files is not None: 235 | extension = self.validation_files[0].split(".")[-1] 236 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 237 | 238 | def main(): 239 | # See all possible arguments in src/transformers/training_args.py 240 | # or by passing the --help flag to this script. 241 | # We now keep distinct sets of args, for a cleaner separation of concerns. 242 | 243 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 244 | # pdb.set_trace() 245 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 246 | # If we pass only one argument to the script and it's the path to a json file, 247 | # let's parse it to get our arguments. 248 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 249 | else: 250 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 251 | 252 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 253 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 254 | send_example_telemetry("run_clm", model_args, data_args) 255 | 256 | # Setup logging 257 | logging.basicConfig( 258 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 259 | datefmt="%m/%d/%Y %H:%M:%S", 260 | handlers=[logging.StreamHandler(sys.stdout)], 261 | ) 262 | 263 | if training_args.should_log: 264 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
265 | transformers.utils.logging.set_verbosity_info() 266 | 267 | log_level = training_args.get_process_log_level() 268 | logger.setLevel(log_level) 269 | datasets.utils.logging.set_verbosity(log_level) 270 | transformers.utils.logging.set_verbosity(log_level) 271 | transformers.utils.logging.enable_default_handler() 272 | transformers.utils.logging.enable_explicit_format() 273 | 274 | # Log on each process the small summary: 275 | logger.warning( 276 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 277 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 278 | ) 279 | logger.info(f"Training/evaluation parameters {training_args}") 280 | 281 | # Detecting last checkpoint. 282 | last_checkpoint = None 283 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 284 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 285 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 286 | raise ValueError( 287 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 288 | "Use --overwrite_output_dir to overcome." 289 | ) 290 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 291 | logger.info( 292 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 293 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 294 | ) 295 | 296 | # Set seed before initializing model. 297 | set_seed(training_args.seed) 298 | 299 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 300 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 301 | # (the dataset will be downloaded automatically from the datasets Hub). 302 | # 303 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 304 | # 'text' is found. You can easily tweak this behavior (see below). 305 | # 306 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 307 | # download the dataset. 308 | if True: 309 | data_files = {} 310 | dataset_args = {} 311 | if data_args.train_files is not None: 312 | data_files["train"] = data_args.train_files 313 | if data_args.validation_files is not None: 314 | data_files["validation"] = data_args.validation_files 315 | extension = ( 316 | data_args.train_files[0].split(".")[-1] 317 | if data_args.train_files is not None 318 | else data_args.validation_files.split(".")[-1] 319 | ) 320 | if extension == "txt": 321 | extension = "text" 322 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 323 | raw_datasets = load_dataset( 324 | extension, 325 | data_files=data_files, 326 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 327 | use_auth_token=True if model_args.use_auth_token else None, 328 | **dataset_args, 329 | ) 330 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
331 | if "validation" not in raw_datasets.keys(): 332 | raw_datasets["validation"] = load_dataset( 333 | extension, 334 | data_files=data_files, 335 | split=f"train[:{data_args.validation_split_percentage}%]", 336 | cache_dir=model_args.cache_dir, 337 | use_auth_token=True if model_args.use_auth_token else None, 338 | **dataset_args, 339 | ) 340 | raw_datasets["train"] = load_dataset( 341 | extension, 342 | data_files=data_files, 343 | split=f"train[{data_args.validation_split_percentage}%:]", 344 | cache_dir=model_args.cache_dir, 345 | use_auth_token=True if model_args.use_auth_token else None, 346 | **dataset_args, 347 | ) 348 | 349 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 350 | # https://huggingface.co/docs/datasets/loading_datasets.html. 351 | 352 | # Load pretrained model and tokenizer 353 | # 354 | # Distributed training: 355 | # The .from_pretrained methods guarantee that only one local process can concurrently 356 | # download model & vocab. 357 | 358 | config_kwargs = { 359 | "cache_dir": model_args.cache_dir, 360 | "revision": model_args.model_revision, 361 | "use_auth_token": True if model_args.use_auth_token else None, 362 | } 363 | if model_args.config_name: 364 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 365 | elif model_args.model_name_or_path: 366 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 367 | else: 368 | config = CONFIG_MAPPING[model_args.model_type]() 369 | logger.warning("You are instantiating a new config instance from scratch.") 370 | if model_args.config_overrides is not None: 371 | logger.info(f"Overriding config: {model_args.config_overrides}") 372 | config.update_from_string(model_args.config_overrides) 373 | logger.info(f"New config: {config}") 374 | 375 | tokenizer_kwargs = { 376 | "cache_dir": model_args.cache_dir, 377 | "use_fast": model_args.use_fast_tokenizer, 378 | "revision": model_args.model_revision, 379 | "use_auth_token": True if model_args.use_auth_token else None, 380 | "padding_side":'left' 381 | } 382 | if model_args.tokenizer_name: 383 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 384 | elif model_args.model_name_or_path: 385 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 386 | else: 387 | raise ValueError( 388 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 389 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
390 | ) 391 | tokenizer.pad_token = tokenizer.eos_token 392 | if model_args.model_name_or_path: 393 | torch_dtype = ( 394 | model_args.torch_dtype 395 | if model_args.torch_dtype in ["auto", None] 396 | else getattr(torch, model_args.torch_dtype) 397 | ) 398 | print(torch_dtype) 399 | torch_dtype = torch.float16 400 | model = AutoModelForCausalLM.from_pretrained( 401 | model_args.model_name_or_path, 402 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 403 | config=config, 404 | cache_dir=model_args.cache_dir, 405 | revision=model_args.model_revision, 406 | use_auth_token=True if model_args.use_auth_token else None, 407 | torch_dtype=torch_dtype, 408 | trust_remote_code=True, 409 | use_flash_attention_2=True, 410 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)} 411 | ) 412 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[]) 413 | 414 | else: 415 | model = AutoModelForCausalLM.from_config(config) 416 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 417 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 418 | 419 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 420 | # on a small vocab and want a smaller embedding size, remove this test. 421 | embedding_size = model.get_input_embeddings().weight.shape[0] 422 | if len(tokenizer) > embedding_size: 423 | model.resize_token_embeddings(len(tokenizer)) 424 | 425 | # Preprocessing the datasets. 426 | # First we tokenize all the texts. 427 | if training_args.do_train: 428 | column_names = list(raw_datasets["train"].features) 429 | else: 430 | column_names = list(raw_datasets["validation"].features) 431 | 432 | train_on_inputs = True 433 | if len(column_names)==1: 434 | text_column_name = "text" if "text" in column_names else column_names[0] 435 | elif len(column_names)==2: 436 | input_column_name = 'input' if 'input' in column_names else column_names[0] 437 | target_column_name = 'target' if 'target' in column_names else column_names[0] 438 | train_on_inputs=False 439 | else: 440 | raise ValueError('输入文件列数不对') 441 | print('train_on_inputs',train_on_inputs) 442 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 443 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 444 | 445 | def tokenize_function(examples): 446 | with CaptureLogger(tok_logger) as cl: 447 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 448 | output['labels'] = output['input_ids'].copy() 449 | return output 450 | 451 | def tokenize(prompt): 452 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 453 | result["labels"] = result["input_ids"].copy() 454 | return result 455 | 456 | def generate_and_tokenize_prompt(data_point): 457 | input_text = data_point[input_column_name] 458 | target_text = data_point[target_column_name] 459 | full_prompt = input_text+target_text 460 | tokenized_full_prompt = tokenize(full_prompt) 461 | if not train_on_inputs: 462 | user_prompt = input_text 463 | tokenized_user_prompt = tokenize(user_prompt) 464 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 465 | tokenized_full_prompt["labels"] = [ 466 | -100 467 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 468 | 
user_prompt_len: 469 | ] 470 | return tokenized_full_prompt 471 | 472 | 473 | 474 | with training_args.main_process_first(desc="dataset map tokenization"): 475 | if not data_args.streaming: 476 | tokenized_datasets = raw_datasets.map( 477 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 478 | batched=True if train_on_inputs==True else False, 479 | num_proc=data_args.preprocessing_num_workers, 480 | remove_columns=column_names, 481 | load_from_cache_file=not data_args.overwrite_cache, 482 | desc="Running tokenizer on dataset", 483 | ) 484 | else: 485 | tokenized_datasets = raw_datasets.map( 486 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 487 | batched=True if train_on_inputs==True else False, 488 | remove_columns=column_names, 489 | ) 490 | 491 | if data_args.block_size is None: 492 | block_size = tokenizer.model_max_length 493 | if block_size > 2048: 494 | block_size = 2048 495 | else: 496 | block_size = min(data_args.block_size, tokenizer.model_max_length) 497 | 498 | if training_args.do_train: 499 | if "train" not in tokenized_datasets: 500 | raise ValueError("--do_train requires a train dataset") 501 | train_dataset = tokenized_datasets["train"] 502 | if data_args.max_train_samples is not None: 503 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 504 | train_dataset = train_dataset.select(range(max_train_samples)) 505 | for index in random.sample(range(len(train_dataset)), 3): 506 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 507 | train_dataset = train_dataset.shuffle(seed=training_args.seed) 508 | 509 | if training_args.do_eval: 510 | if "validation" not in tokenized_datasets: 511 | raise ValueError("--do_eval requires a validation dataset") 512 | eval_dataset = tokenized_datasets["validation"] 513 | if data_args.max_eval_samples is not None: 514 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 515 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 516 | 517 | def preprocess_logits_for_metrics(logits, labels): 518 | if isinstance(logits, tuple): 519 | # Depending on the model and config, logits may contain extra tensors, 520 | # like past_key_values, but logits always come first 521 | logits = logits[0] 522 | return logits.argmax(dim=-1) 523 | 524 | metric = evaluate.load("accuracy.py") 525 | 526 | def compute_metrics(eval_preds): 527 | preds, labels = eval_preds 528 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 529 | # by preprocess_logits_for_metrics but we need to shift the labels 530 | labels = labels[:, 1:].reshape(-1) 531 | # .reshape(-1) 532 | preds = preds[:, :-1].reshape(-1) 533 | # .reshape(-1) 534 | # print(labels.shape) 535 | # true_predictions = [ 536 | # [p for (p, l) in zip(pred, gold_label) if l != -100] 537 | # for pred, gold_label in zip(preds, labels) 538 | # ] 539 | # true_labels = [ 540 | # [l for (p, l) in zip(pred, gold_label) if l != -100] 541 | # for pred, gold_label in zip(preds, labels) 542 | # ] 543 | # preds = np.array(true_predictions).reshape(-1) 544 | # labels = np.array(true_labels).reshape(-1) 545 | return metric.compute(predictions=preds, references=labels) 546 | 547 | # Initialize our Trainer 548 | trainer = Trainer( 549 | model=model, 550 | args=training_args, 551 | train_dataset=train_dataset if training_args.do_train else None, 552 | eval_dataset=eval_dataset if training_args.do_eval else None, 553 | tokenizer=tokenizer, 554 | # Data collator will 
default to DataCollatorWithPadding, so we change it. 555 | data_collator=transformers.DataCollatorForSeq2Seq( 556 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 557 | ), 558 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 559 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 560 | ) 561 | 562 | # Training 563 | if training_args.do_train: 564 | checkpoint = None 565 | if training_args.resume_from_checkpoint is not None: 566 | checkpoint = training_args.resume_from_checkpoint 567 | elif last_checkpoint is not None: 568 | checkpoint = last_checkpoint 569 | 570 | print(training_args.local_rank,'start train') 571 | 572 | if torch.__version__ >= "2" and sys.platform != "win32": 573 | model = torch.compile(model) 574 | 575 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 576 | trainer.save_model() # Saves the tokenizer too for easy upload 577 | 578 | metrics = train_result.metrics 579 | 580 | max_train_samples = ( 581 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 582 | ) 583 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 584 | 585 | trainer.log_metrics("train", metrics) 586 | trainer.save_metrics("train", metrics) 587 | trainer.save_state() 588 | 589 | # Evaluation 590 | if training_args.do_eval: 591 | logger.info("*** Evaluate ***") 592 | 593 | metrics = trainer.evaluate() 594 | 595 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 596 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 597 | try: 598 | perplexity = math.exp(metrics["eval_loss"]) 599 | except OverflowError: 600 | perplexity = float("inf") 601 | metrics["perplexity"] = perplexity 602 | 603 | trainer.log_metrics("eval", metrics) 604 | trainer.save_metrics("eval", metrics) 605 | 606 | 607 | 608 | def _mp_fn(index): 609 | # For xla_spawn (TPUs) 610 | main() 611 | 612 | 613 | if __name__ == "__main__": 614 | main() 615 | -------------------------------------------------------------------------------- /train/sft/finetune_clm_lora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 19 | https://huggingface.co/models?filter=text-generation 20 | """ 21 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
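# Illustrative sketch (not from this repository): relative to finetune_clm.py, the LoRA variant's
# main difference is typically that the base model is wrapped with a PEFT adapter before it
# reaches the Trainer. Assuming the argument names defined below (lora_r, lora_alpha,
# target_modules), the wrapping step usually looks like:
#
#     from peft import LoraConfig, get_peft_model
#
#     lora_config = LoraConfig(
#         r=16,                                  # rank of the low-rank update matrices
#         lora_alpha=32,                         # scaling factor applied to the update
#         target_modules=["q_proj", "v_proj"],   # example Linear layers that receive adapters
#         lora_dropout=0.05,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )
#     model = get_peft_model(model, lora_config)
#     model.print_trainable_parameters()         # only the adapter weights remain trainable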
22 | 23 | import logging 24 | import math 25 | import os 26 | import sys 27 | import random 28 | from dataclasses import dataclass, field 29 | from itertools import chain 30 | import deepspeed 31 | from typing import Optional,List,Union 32 | 33 | import datasets 34 | import evaluate 35 | import torch 36 | from datasets import load_dataset 37 | from peft import ( # noqa: E402 38 | LoraConfig, 39 | PeftModel, 40 | get_peft_model, 41 | get_peft_model_state_dict, 42 | prepare_model_for_int8_training, 43 | prepare_model_for_kbit_training, 44 | set_peft_model_state_dict, 45 | ) 46 | import transformers 47 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 48 | from transformers import ( 49 | CONFIG_MAPPING, 50 | MODEL_FOR_CAUSAL_LM_MAPPING, 51 | AutoConfig, 52 | AutoModelForCausalLM, 53 | AutoTokenizer, 54 | TrainerCallback, 55 | TrainerState, 56 | TrainerControl, 57 | HfArgumentParser, 58 | Trainer, 59 | TrainingArguments, 60 | default_data_collator, 61 | BitsAndBytesConfig, 62 | is_torch_tpu_available, 63 | set_seed, 64 | ) 65 | from transformers.testing_utils import CaptureLogger 66 | from transformers.trainer_utils import get_last_checkpoint 67 | from transformers.utils import check_min_version, send_example_telemetry 68 | from transformers.utils.versions import require_version 69 | 70 | import pdb 71 | 72 | 73 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 74 | # check_min_version("4.27.0.dev0") 75 | 76 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 77 | 78 | logger = logging.getLogger(__name__) 79 | 80 | 81 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 82 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 83 | 84 | 85 | @dataclass 86 | class ModelArguments: 87 | """ 88 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 89 | """ 90 | 91 | model_name_or_path: Optional[str] = field( 92 | default=None, 93 | metadata={ 94 | "help": ( 95 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 96 | ) 97 | }, 98 | ) 99 | model_type: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 102 | ) 103 | config_overrides: Optional[str] = field( 104 | default=None, 105 | metadata={ 106 | "help": ( 107 | "Override some existing default config settings when a model is trained from scratch. Example: " 108 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 109 | ) 110 | }, 111 | ) 112 | config_name: Optional[str] = field( 113 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 114 | ) 115 | tokenizer_name: Optional[str] = field( 116 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 117 | ) 118 | cache_dir: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 121 | ) 122 | lora_r: Optional[int] = field(default=16) 123 | lora_alpha: Optional[int] = field(default=32) 124 | target_modules: Optional[str] = field( 125 | default='q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj', 126 | metadata={ 127 | "help": "List of module names or regex expression of the module names to replace with Lora." 
128 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 129 | }, 130 | ) 131 | use_fast_tokenizer: bool = field( 132 | default=True, 133 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 134 | ) 135 | load_in_bits: Optional[int] = field(default=8) 136 | model_revision: str = field( 137 | default="main", 138 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 139 | ) 140 | use_auth_token: bool = field( 141 | default=False, 142 | metadata={ 143 | "help": ( 144 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 145 | "with private models)." 146 | ) 147 | }, 148 | ) 149 | 150 | torch_dtype: Optional[str] = field( 151 | default=None, 152 | metadata={ 153 | "help": ( 154 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 155 | "dtype will be automatically derived from the model's weights." 156 | ), 157 | "choices": ["auto", "bfloat16", "float16", "float32"], 158 | }, 159 | ) 160 | 161 | def __post_init__(self): 162 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 163 | raise ValueError( 164 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 165 | ) 166 | if type(self.target_modules)==str: 167 | self.target_modules = self.target_modules.split(',') 168 | 169 | 170 | @dataclass 171 | class DataTrainingArguments: 172 | """ 173 | Arguments pertaining to what data we are going to input our model for training and eval. 174 | """ 175 | train_on_inputs: bool = field( 176 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 177 | ) 178 | dataset_name: Optional[str] = field( 179 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 180 | ) 181 | dataset_config_name: Optional[str] = field( 182 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 183 | ) 184 | train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 185 | validation_files: Optional[List[str]] = field( 186 | default=None, 187 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 188 | ) 189 | max_train_samples: Optional[int] = field( 190 | default=None, 191 | metadata={ 192 | "help": ( 193 | "For debugging purposes or quicker training, truncate the number of training examples to this " 194 | "value if set." 195 | ) 196 | }, 197 | ) 198 | max_eval_samples: Optional[int] = field( 199 | default=None, 200 | metadata={ 201 | "help": ( 202 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 203 | "value if set." 204 | ) 205 | }, 206 | ) 207 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 208 | block_size: Optional[int] = field( 209 | default=None, 210 | metadata={ 211 | "help": ( 212 | "Optional input sequence length after tokenization. " 213 | "The training dataset will be truncated in block of this size for training. " 214 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
215 | ) 216 | }, 217 | ) 218 | overwrite_cache: bool = field( 219 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 220 | ) 221 | validation_split_percentage: Optional[int] = field( 222 | default=5, 223 | metadata={ 224 | "help": "The percentage of the train set used as validation set in case there's no validation split" 225 | }, 226 | ) 227 | preprocessing_num_workers: Optional[int] = field( 228 | default=None, 229 | metadata={"help": "The number of processes to use for the preprocessing."}, 230 | ) 231 | keep_linebreaks: bool = field( 232 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 233 | ) 234 | 235 | def __post_init__(self): 236 | if self.streaming: 237 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 238 | 239 | if self.dataset_name is None and self.train_files is None and self.validation_files is None: 240 | raise ValueError("Need either a dataset name or a training/validation file.") 241 | else: 242 | if self.train_files is not None: 243 | extension = self.train_files[0].split(".")[-1] 244 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 245 | if self.validation_files is not None: 246 | extension = self.validation_files[0].split(".")[-1] 247 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 248 | 249 | class SavePeftModelCallback(TrainerCallback): 250 | def on_save( 251 | self, 252 | args: TrainingArguments, 253 | state: TrainerState, 254 | control: TrainerControl, 255 | **kwargs, 256 | ): 257 | if state.is_world_process_zero: 258 | print('+++++++++++++++++save call back++++++++++++++++') 259 | checkpoint_folder = os.path.join( 260 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 261 | ) 262 | kwargs["model"].save_pretrained(checkpoint_folder) 263 | 264 | pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin") 265 | if os.path.exists(pytorch_model_path): 266 | os.remove(pytorch_model_path) 267 | return control 268 | 269 | def main(): 270 | # See all possible arguments in src/transformers/training_args.py 271 | # or by passing the --help flag to this script. 272 | # We now keep distinct sets of args, for a cleaner separation of concerns. 273 | 274 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 275 | # pdb.set_trace() 276 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 277 | # If we pass only one argument to the script and it's the path to a json file, 278 | # let's parse it to get our arguments. 279 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 280 | else: 281 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 282 | 283 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 284 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 285 | send_example_telemetry("run_clm", model_args, data_args) 286 | 287 | # Setup logging 288 | logging.basicConfig( 289 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 290 | datefmt="%m/%d/%Y %H:%M:%S", 291 | handlers=[logging.StreamHandler(sys.stdout)], 292 | ) 293 | 294 | if training_args.should_log: 295 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
296 | transformers.utils.logging.set_verbosity_info() 297 | 298 | log_level = training_args.get_process_log_level() 299 | logger.setLevel(log_level) 300 | datasets.utils.logging.set_verbosity(log_level) 301 | transformers.utils.logging.set_verbosity(log_level) 302 | transformers.utils.logging.enable_default_handler() 303 | transformers.utils.logging.enable_explicit_format() 304 | 305 | # Log on each process the small summary: 306 | logger.warning( 307 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " 308 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 309 | ) 310 | logger.info(f"Training/evaluation parameters {training_args}") 311 | 312 | # Detecting last checkpoint. 313 | last_checkpoint = None 314 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 315 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 316 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 317 | raise ValueError( 318 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 319 | "Use --overwrite_output_dir to overcome." 320 | ) 321 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 322 | logger.info( 323 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 324 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 325 | ) 326 | 327 | # Set seed before initializing model. 328 | set_seed(training_args.seed) 329 | 330 | # Get the datasets: provide your own CSV/JSON/TXT training and evaluation files via --train_files and 331 | # --validation_files (see below); the --dataset_name hub option declared above is not consumed here, so 332 | # the data is always read from local files. 333 | # 334 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 335 | # 'text' is found. You can easily tweak this behavior (see below). 336 | # 337 | # In distributed training, the load_dataset function guarantees that only one local process can concurrently 338 | # download the dataset. 339 | if True: 340 | data_files = {} 341 | dataset_args = {} 342 | if data_args.train_files is not None: 343 | data_files["train"] = data_args.train_files 344 | if data_args.validation_files is not None: 345 | data_files["validation"] = data_args.validation_files 346 | extension = ( 347 | data_args.train_files[0].split(".")[-1] 348 | if data_args.train_files is not None 349 | else data_args.validation_files[0].split(".")[-1] 350 | ) 351 | if extension == "txt": 352 | extension = "text" 353 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 354 | raw_datasets = load_dataset( 355 | extension, 356 | data_files=data_files, 357 | cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), 358 | use_auth_token=True if model_args.use_auth_token else None, 359 | **dataset_args, 360 | ) 361 | # If no validation data is there, validation_split_percentage will be used to divide the dataset.
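# NOTE: when no --validation_files are supplied, the split below carves the validation set out of the
# training data: with the default --validation_split_percentage 5, train[:5%] becomes the validation
# split and train[5%:] is kept for training.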
362 | if "validation" not in raw_datasets.keys(): 363 | raw_datasets["validation"] = load_dataset( 364 | extension, 365 | data_files=data_files, 366 | split=f"train[:{data_args.validation_split_percentage}%]", 367 | cache_dir=model_args.cache_dir, 368 | use_auth_token=True if model_args.use_auth_token else None, 369 | **dataset_args, 370 | ) 371 | raw_datasets["train"] = load_dataset( 372 | extension, 373 | data_files=data_files, 374 | split=f"train[{data_args.validation_split_percentage}%:]", 375 | cache_dir=model_args.cache_dir, 376 | use_auth_token=True if model_args.use_auth_token else None, 377 | **dataset_args, 378 | ) 379 | 380 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 381 | # https://huggingface.co/docs/datasets/loading_datasets.html. 382 | 383 | # Load pretrained model and tokenizer 384 | # 385 | # Distributed training: 386 | # The .from_pretrained methods guarantee that only one local process can concurrently 387 | # download model & vocab. 388 | 389 | config_kwargs = { 390 | "cache_dir": model_args.cache_dir, 391 | "revision": model_args.model_revision, 392 | "use_auth_token": True if model_args.use_auth_token else None, 393 | } 394 | if model_args.config_name: 395 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 396 | elif model_args.model_name_or_path: 397 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 398 | else: 399 | config = CONFIG_MAPPING[model_args.model_type]() 400 | logger.warning("You are instantiating a new config instance from scratch.") 401 | if model_args.config_overrides is not None: 402 | logger.info(f"Overriding config: {model_args.config_overrides}") 403 | config.update_from_string(model_args.config_overrides) 404 | logger.info(f"New config: {config}") 405 | 406 | tokenizer_kwargs = { 407 | "cache_dir": model_args.cache_dir, 408 | "use_fast": model_args.use_fast_tokenizer, 409 | "revision": model_args.model_revision, 410 | "use_auth_token": True if model_args.use_auth_token else None, 411 | "padding_side":'left' 412 | } 413 | if model_args.tokenizer_name: 414 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 415 | elif model_args.model_name_or_path: 416 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 417 | else: 418 | raise ValueError( 419 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 420 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
421 | ) 422 | tokenizer.pad_token = tokenizer.eos_token 423 | lora_config = LoraConfig( 424 | r=model_args.lora_r, 425 | lora_alpha=model_args.lora_alpha, 426 | # target_modules=["query_key_value"], 427 | # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'], 428 | target_modules = model_args.target_modules, 429 | fan_in_fan_out = False, 430 | lora_dropout=0.05, 431 | inference_mode=False, 432 | bias="none", 433 | task_type="CAUSAL_LM", 434 | ) 435 | print(lora_config) 436 | bnb_config = BitsAndBytesConfig( 437 | load_in_4bit=True, 438 | bnb_4bit_use_double_quant=True, 439 | bnb_4bit_quant_type="nf4", 440 | bnb_4bit_compute_dtype=torch.bfloat16 441 | ) 442 | if model_args.model_name_or_path: 443 | torch_dtype = ( 444 | model_args.torch_dtype 445 | if model_args.torch_dtype in ["auto", None] 446 | else getattr(torch, model_args.torch_dtype) 447 | ) 448 | print(torch_dtype) 449 | torch_dtype = torch_dtype if torch_dtype is not None else torch.float16  # honor an explicit --torch_dtype; fall back to fp16 otherwise 450 | model = AutoModelForCausalLM.from_pretrained( 451 | model_args.model_name_or_path, 452 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 453 | config=config, 454 | cache_dir=model_args.cache_dir, 455 | revision=model_args.model_revision, 456 | use_auth_token=True if model_args.use_auth_token else None, 457 | torch_dtype=torch_dtype, 458 | load_in_8bit=True if model_args.load_in_bits==8 else False, 459 | trust_remote_code=True, 460 | use_flash_attention_2=True, 461 | quantization_config=bnb_config if model_args.load_in_bits==4 else None, 462 | # device_map = 'auto' 463 | device_map={"": int(os.environ.get("LOCAL_RANK") or 0)} 464 | ) 465 | # model = prepare_model_for_int8_training(model, output_embedding_layer_name="embed_out", layer_norm_names=[]) 466 | 467 | else: 468 | model = AutoModelForCausalLM.from_config(config) 469 | n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) 470 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 471 | 472 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 473 | # on a small vocab and want a smaller embedding size, remove this test. 474 | embedding_size = model.get_input_embeddings().weight.shape[0] 475 | if len(tokenizer) > embedding_size: 476 | model.resize_token_embeddings(len(tokenizer)) 477 | if model_args.load_in_bits==8: 478 | model = prepare_model_for_int8_training(model) 479 | elif model_args.load_in_bits==4: 480 | model = prepare_model_for_kbit_training(model) 481 | 482 | # Preprocessing the datasets. 483 | # First we tokenize all the texts.
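# NOTE: two preprocessing paths are selected below based on how many columns the input files have:
#   * one column ("text"): tokenize_function is used and the labels are a plain copy of input_ids, so the
#     loss is computed on every token;
#   * two columns ("input"/"target"): generate_and_tokenize_prompt concatenates prompt and response and
#     masks the prompt tokens with -100, so the loss is only computed on the response tokens.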
484 | if training_args.do_train: 485 | column_names = list(raw_datasets["train"].features) 486 | else: 487 | column_names = list(raw_datasets["validation"].features) 488 | 489 | train_on_inputs = True  # set from the column layout below; the --train_on_inputs argument is not consulted here 490 | if len(column_names)==1: 491 | text_column_name = "text" if "text" in column_names else column_names[0] 492 | elif len(column_names)==2: 493 | input_column_name = 'input' if 'input' in column_names else column_names[0] 494 | target_column_name = 'target' if 'target' in column_names else column_names[1] 495 | train_on_inputs=False 496 | else: 497 | raise ValueError('Unexpected number of columns in the input file') 498 | print('train_on_inputs',train_on_inputs) 499 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 500 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 501 | 502 | def tokenize_function(examples): 503 | with CaptureLogger(tok_logger) as cl: 504 | output = tokenizer([ item for item in examples[text_column_name]],truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 505 | output['labels'] = output['input_ids'].copy() 506 | return output 507 | 508 | def tokenize(prompt): 509 | result = tokenizer(prompt,truncation=True,max_length=data_args.block_size,padding=False,return_tensors=None) 510 | result["labels"] = result["input_ids"].copy() 511 | return result 512 | 513 | def generate_and_tokenize_prompt(data_point): 514 | input_text = data_point[input_column_name] 515 | target_text = data_point[target_column_name] 516 | full_prompt = input_text+target_text 517 | tokenized_full_prompt = tokenize(full_prompt) 518 | if not train_on_inputs: 519 | user_prompt = input_text 520 | tokenized_user_prompt = tokenize(user_prompt) 521 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 522 | tokenized_full_prompt["labels"] = [ 523 | -100 524 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 525 | user_prompt_len: 526 | ] 527 | return tokenized_full_prompt 528 | 529 | 530 | 531 | with training_args.main_process_first(desc="dataset map tokenization"): 532 | if not data_args.streaming: 533 | tokenized_datasets = raw_datasets.map( 534 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 535 | batched=True if train_on_inputs==True else False, 536 | num_proc=data_args.preprocessing_num_workers, 537 | remove_columns=column_names, 538 | load_from_cache_file=not data_args.overwrite_cache, 539 | desc="Running tokenizer on dataset", 540 | ) 541 | else: 542 | tokenized_datasets = raw_datasets.map( 543 | tokenize_function if train_on_inputs==True else generate_and_tokenize_prompt, 544 | batched=True if train_on_inputs==True else False, 545 | remove_columns=column_names, 546 | ) 547 | 548 | if data_args.block_size is None: 549 | block_size = tokenizer.model_max_length 550 | if block_size > 2048: 551 | block_size = 2048 552 | else: 553 | block_size = min(data_args.block_size, tokenizer.model_max_length) 554 | 555 | if training_args.do_train: 556 | if "train" not in tokenized_datasets: 557 | raise ValueError("--do_train requires a train dataset") 558 | train_dataset = tokenized_datasets["train"] 559 | if data_args.max_train_samples is not None: 560 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 561 | train_dataset = train_dataset.select(range(max_train_samples)) 562 | for index in random.sample(range(len(train_dataset)), 3): 563 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 564 | train_dataset = 
train_dataset.shuffle(seed=training_args.seed) 565 | 566 | if training_args.do_eval: 567 | if "validation" not in tokenized_datasets: 568 | raise ValueError("--do_eval requires a validation dataset") 569 | eval_dataset = tokenized_datasets["validation"] 570 | if data_args.max_eval_samples is not None: 571 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 572 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 573 | 574 | def preprocess_logits_for_metrics(logits, labels): 575 | if isinstance(logits, tuple): 576 | # Depending on the model and config, logits may contain extra tensors, 577 | # like past_key_values, but logits always come first 578 | logits = logits[0] 579 | return logits.argmax(dim=-1) 580 | 581 | metric = evaluate.load("accuracy.py") 582 | 583 | def compute_metrics(eval_preds): 584 | preds, labels = eval_preds 585 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 586 | # by preprocess_logits_for_metrics but we need to shift the labels 587 | labels = labels[:, 1:].reshape(-1) 588 | # .reshape(-1) 589 | preds = preds[:, :-1].reshape(-1) 590 | # .reshape(-1) 591 | # print(labels.shape) 592 | # true_predictions = [ 593 | # [p for (p, l) in zip(pred, gold_label) if l != -100] 594 | # for pred, gold_label in zip(preds, labels) 595 | # ] 596 | # true_labels = [ 597 | # [l for (p, l) in zip(pred, gold_label) if l != -100] 598 | # for pred, gold_label in zip(preds, labels) 599 | # ] 600 | # preds = np.array(true_predictions).reshape(-1) 601 | # labels = np.array(true_labels).reshape(-1) 602 | return metric.compute(predictions=preds, references=labels) 603 | # layer_norm_names=[] 604 | 605 | 606 | 607 | model = get_peft_model(model, lora_config) 608 | model.print_trainable_parameters() 609 | 610 | # Initialize our Trainer 611 | trainer = Trainer( 612 | model=model, 613 | args=training_args, 614 | train_dataset=train_dataset if training_args.do_train else None, 615 | eval_dataset=eval_dataset if training_args.do_eval else None, 616 | tokenizer=tokenizer, 617 | # Data collator will default to DataCollatorWithPadding, so we change it. 618 | data_collator=transformers.DataCollatorForSeq2Seq( 619 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 620 | ), 621 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 622 | preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, 623 | callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None), 624 | ) 625 | 626 | # Training 627 | if training_args.do_train: 628 | checkpoint = None 629 | '''if training_args.resume_from_checkpoint is not None: 630 | resume_from_checkpoint = training_args.resume_from_checkpoint 631 | checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin") 632 | if not os.path.exists(checkpoint_name): 633 | checkpoint_name = os.path.join( 634 | resume_from_checkpoint, "adapter_model.bin" 635 | ) # only LoRA model - LoRA config above has to fit 636 | resume_from_checkpoint = ( 637 | False # So the trainer won't try loading its state 638 | ) 639 | # The two files above have a different name depending on how they were saved, but are actually the same. 
640 | if os.path.exists(checkpoint_name): 641 | print(f"Restarting from {checkpoint_name}") 642 | adapters_weights = torch.load(checkpoint_name) 643 | set_peft_model_state_dict(model, adapters_weights) 644 | else: 645 | print(f"Checkpoint {checkpoint_name} not found") 646 | # checkpoint = Fa''' 647 | if training_args.resume_from_checkpoint is not None: 648 | checkpoint = training_args.resume_from_checkpoint 649 | elif last_checkpoint is not None: 650 | checkpoint = last_checkpoint 651 | 652 | if torch.__version__ >= "2" and sys.platform != "win32": 653 | model = torch.compile(model) 654 | 655 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 656 | trainer.save_model() # Saves the tokenizer too for easy upload 657 | 658 | metrics = train_result.metrics 659 | 660 | max_train_samples = ( 661 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 662 | ) 663 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 664 | 665 | trainer.log_metrics("train", metrics) 666 | trainer.save_metrics("train", metrics) 667 | trainer.save_state() 668 | 669 | # Evaluation 670 | if training_args.do_eval: 671 | logger.info("*** Evaluate ***") 672 | 673 | metrics = trainer.evaluate() 674 | 675 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 676 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 677 | try: 678 | perplexity = math.exp(metrics["eval_loss"]) 679 | except OverflowError: 680 | perplexity = float("inf") 681 | metrics["perplexity"] = perplexity 682 | 683 | trainer.log_metrics("eval", metrics) 684 | trainer.save_metrics("eval", metrics) 685 | 686 | 687 | 688 | def _mp_fn(index): 689 | # For xla_spawn (TPUs) 690 | main() 691 | 692 | 693 | if __name__ == "__main__": 694 | main() 695 | -------------------------------------------------------------------------------- /train/sft/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | output_model=save_folder 2 | # Change this to your own output directory 3 | if [ !
-d ${output_model} ];then 4 | mkdir ${output_model} 5 | fi 6 | export CUDA_HOME=/usr/local/cuda/ 7 | export NCCL_P2P_DISABLE=1 8 | cp ./finetune_lora.sh ${output_model}  # keep a copy of the launch script alongside the outputs for reproducibility 9 | deepspeed --include localhost:1,0 finetune_clm_lora.py \ 10 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ 11 | --train_files ../../data/train_sft.csv \ 12 | --validation_files ../../data/dev_sft.csv \ 13 | ../../data/dev_sft_sharegpt.csv \ 14 | --per_device_train_batch_size 1 \ 15 | --per_device_eval_batch_size 1 \ 16 | --do_train \ 17 | --do_eval \ 18 | --use_fast_tokenizer false \ 19 | --output_dir ${output_model} \ 20 | --evaluation_strategy steps \ 21 | --max_eval_samples 800 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 8 \ 24 | --num_train_epochs 10 \ 25 | --warmup_steps 400 \ 26 | --load_in_bits 4 \ 27 | --lora_r 8 \ 28 | --lora_alpha 32 \ 29 | --target_modules q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj \ 30 | --logging_dir ${output_model}/logs \ 31 | --logging_strategy steps \ 32 | --logging_steps 10 \ 33 | --save_strategy steps \ 34 | --preprocessing_num_workers 10 \ 35 | --save_steps 20 \ 36 | --eval_steps 20 \ 37 | --save_total_limit 2000 \ 38 | --seed 42 \ 39 | --disable_tqdm false \ 40 | --ddp_find_unused_parameters false \ 41 | --block_size 2048 \ 42 | --report_to tensorboard \ 43 | --overwrite_output_dir \ 44 | --deepspeed ds_config_zero2.json \ 45 | --ignore_data_skip true \ 46 | --bf16 \ 47 | --gradient_checkpointing \ 48 | --bf16_full_eval \ 49 | --ddp_timeout 18000000 \ 50 | | tee -a ${output_model}/train.log 51 | 52 | 53 | 54 | # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ 55 | --------------------------------------------------------------------------------
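Once training finishes, the checkpoints written by SavePeftModelCallback contain only the LoRA adapter weights, not the full model. The sketch below shows one way to load such an adapter on top of the base model for a quick inference check; the paths, the prompt, and the generation settings are illustrative assumptions rather than part of the repository (alternatively, merge_peft_adapter.py under train/merge_peft_model can be used to merge the adapter into the base weights).

# inference_check.py -- minimal sketch, assuming the paths below point at your base model and at a LoRA
# checkpoint produced by finetune_clm_lora.py; adjust them to your environment.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "meta-llama/Llama-2-7b-chat-hf"   # same value as --model_name_or_path in finetune_lora.sh
adapter_dir = "save_folder/checkpoint-20"      # hypothetical checkpoint folder written during training

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_dir)  # attach the trained LoRA adapter
model.eval()

prompt = "Hello, please introduce yourself."   # illustrative prompt; use whatever chat format your data followed
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=0.3)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))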