├── README.md ├── assets ├── 5a7795e27634a0136de66eff06e9846.png ├── 63f9554c1f57b989422357ea7a12f7a.png ├── 6faf70b7cb961d031078bd4f340cda6.png ├── 70534d69bb1f2c82f2950931d410e1e.png ├── 80b73c2789fe16bf5a5d6291a152127.png ├── a06c05a337f8631f07c11037a2add83.png ├── a261ba792e071c91cfd815e0360851a.png ├── b9228fce991201c68efcc884b824c3e.png ├── bc0ff61b6d6e8727952f2b8889ce971.png ├── c6636b3b34fc6341cec39baf6a2c6c3.png ├── c993aea2dbcf2e91ab1324ded4411c3.png ├── d9e16ec9ca04f202ec3d0a4bd1b5265.png └── f26c56bb42d4390877384f5e81c8109.png ├── configs ├── .gitkeep ├── assistant │ └── llama3_8b_instruct_qlora_assistant.py ├── llama3-agentflan │ └── llama3_8b_instruct_qlora_agentflan_3e.py ├── llama3-alpaca │ ├── README.md │ ├── llama3_8b_128k_full_alpaca_e3.py │ ├── llama3_8b_512_qlora_alpaca_e3.py │ ├── llama3_8b_8k_full_alpaca_e3.py │ ├── llama3_8b_8k_full_alpaca_e3_sp2.py │ └── llama3_8b_8k_qlora_alpaca_e3.py └── llama3-llava │ └── llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py ├── data ├── .gitkeep └── self_cognition.json ├── docs ├── agent.md ├── assistant.md ├── env.md ├── hello_world.md ├── llava.md ├── lmdeploy.md └── opencompass.md ├── docs_autodl ├── agent.md ├── assistant.md ├── env.md ├── hello_world.md ├── llava.md ├── lmdeploy.md └── opencompass.md ├── tools ├── .gitkeep ├── agent_cli_demo.py ├── agent_web_demo.py ├── convert_agentflan.py ├── gdata.py ├── internstudio_quant_web_demo.py └── internstudio_web_demo.py └── web_demo.py /README.md: -------------------------------------------------------------------------------- 1 | # Llama3-Tutorial(Llama 3 超级课堂) 2 | 3 | 带大家熟悉 Llama 3 微调、量化部署、评测全链路(基于书生·浦语大模型工具链) 4 | 5 |
6 | image 7 |
8 | 9 |
10 |
11 | 12 |
| Lesson | Content | Materials |
| ------ | ------------------------------------------- | ----------------------------------- |
| Prerequisites | Connecting to the dev machine remotely with VS Code | [InternStudio docs](./docs/env.md), [AutoDL docs](./docs_autodl/env.md), [video](https://www.bilibili.com/video/BV1Qi42117mb) |
| Lesson 1 | Deploying a local Llama 3 web demo | [InternStudio docs](./docs/hello_world.md), [AutoDL docs](./docs_autodl/hello_world.md), [video](https://www.bilibili.com/video/BV1Fu4m1F7FA) |
| Lesson 2 | Fine-tuning Llama 3 into a personal assistant ([XTuner](https://github.com/internLM/xtuner)) | [InternStudio docs](./docs/assistant.md), [AutoDL docs](./docs_autodl/assistant.md), [video](https://www.bilibili.com/video/BV1Em41117RD) |
| Lesson 3 | Fine-tuning Llama 3 for image understanding ([XTuner](https://github.com/internLM/xtuner) + LLaVA) | [InternStudio docs](./docs/llava.md), [AutoDL docs](./docs_autodl/llava.md), [video](https://www.bilibili.com/video/BV15f421S7cg) |
| Lesson 4 | Efficient Llama 3 deployment ([LMDeploy](https://github.com/internLM/lmdeploy)) | [InternStudio docs](./docs/lmdeploy.md), [AutoDL docs](./docs_autodl/lmdeploy.md), [video](https://www.bilibili.com/video/BV1fx4y1B74B) |
| Lesson 5 | Llama 3 agent capabilities: hands-on and fine-tuning | [InternStudio docs](./docs/agent.md), [AutoDL docs](./docs_autodl/agent.md), [video](https://www.bilibili.com/video/BV1zi42117MP) |
| Lesson 6 | Evaluating Llama 3 ([OpenCompass](https://github.com/open-compass/opencompass)) | [InternStudio docs](./docs/opencompass.md), [AutoDL docs](./docs_autodl/opencompass.md), [video](https://www.bilibili.com/video/BV19i42117yr) |
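As a quick start for the fine-tuning lessons, the configs under `configs/` are launched with XTuner. A minimal sketch for the Lesson 2 assistant config (assuming XTuner is already installed and the model/data paths referenced inside the config have been prepared; `docs/assistant.md` has the full walkthrough):

```bash
# QLoRA fine-tuning with the Lesson 2 config; the paths to the base model and
# the training data are set inside the config file itself.
xtuner train configs/assistant/llama3_8b_instruct_qlora_assistant.py
# Optionally add `--deepspeed deepspeed_zero1`, as in configs/llama3-alpaca/README.md.
```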
28 | 29 |
30 | image 31 |
32 | 33 | 34 | 35 | 36 |
Welcome to join the Llama 3 discussion group!
39 | 40 |
41 | 42 | 43 |
44 | image 45 |
46 | 47 | 特别感谢书生·浦语社区 A100 的算力支持,大家快给书生·浦语工具链点 Star 哟~ 48 | -------------------------------------------------------------------------------- /assets/5a7795e27634a0136de66eff06e9846.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/5a7795e27634a0136de66eff06e9846.png -------------------------------------------------------------------------------- /assets/63f9554c1f57b989422357ea7a12f7a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/63f9554c1f57b989422357ea7a12f7a.png -------------------------------------------------------------------------------- /assets/6faf70b7cb961d031078bd4f340cda6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/6faf70b7cb961d031078bd4f340cda6.png -------------------------------------------------------------------------------- /assets/70534d69bb1f2c82f2950931d410e1e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/70534d69bb1f2c82f2950931d410e1e.png -------------------------------------------------------------------------------- /assets/80b73c2789fe16bf5a5d6291a152127.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/80b73c2789fe16bf5a5d6291a152127.png -------------------------------------------------------------------------------- /assets/a06c05a337f8631f07c11037a2add83.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/a06c05a337f8631f07c11037a2add83.png -------------------------------------------------------------------------------- /assets/a261ba792e071c91cfd815e0360851a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/a261ba792e071c91cfd815e0360851a.png -------------------------------------------------------------------------------- /assets/b9228fce991201c68efcc884b824c3e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/b9228fce991201c68efcc884b824c3e.png -------------------------------------------------------------------------------- /assets/bc0ff61b6d6e8727952f2b8889ce971.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/bc0ff61b6d6e8727952f2b8889ce971.png -------------------------------------------------------------------------------- /assets/c6636b3b34fc6341cec39baf6a2c6c3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/c6636b3b34fc6341cec39baf6a2c6c3.png -------------------------------------------------------------------------------- /assets/c993aea2dbcf2e91ab1324ded4411c3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/c993aea2dbcf2e91ab1324ded4411c3.png -------------------------------------------------------------------------------- /assets/d9e16ec9ca04f202ec3d0a4bd1b5265.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/d9e16ec9ca04f202ec3d0a4bd1b5265.png -------------------------------------------------------------------------------- /assets/f26c56bb42d4390877384f5e81c8109.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/assets/f26c56bb42d4390877384f5e81c8109.png -------------------------------------------------------------------------------- /configs/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /configs/assistant/llama3_8b_instruct_qlora_assistant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from datasets import load_dataset 4 | from mmengine.dataset import DefaultSampler 5 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 6 | LoggerHook, ParamSchedulerHook) 7 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 8 | from peft import LoraConfig 9 | from torch.optim import AdamW 10 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 11 | BitsAndBytesConfig) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.collate_fns import default_collate_fn 15 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 16 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 17 | VarlenAttnArgsToMessageHubHook) 18 | from xtuner.engine.runner import TrainLoop 19 | from xtuner.model import SupervisedFinetune 20 | from xtuner.parallel.sequence import SequenceParallelSampler 21 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = '/root/model/Meta-Llama-3-8B-Instruct' 28 | use_varlen_attn = False 29 | 30 | # Data 31 | data_files = ['/root/Llama3-Tutorial/data/personal_assistant.json'] 32 | prompt_template = PROMPT_TEMPLATE.llama3_chat 33 | max_length = 512 34 | pack_to_max_length = True 35 | 36 | # parallel 37 | sequence_parallel_size = 1 38 | 39 | # Scheduler & Optimizer 40 | batch_size = 1 # per_device 41 | accumulative_counts = 16 42 | accumulative_counts *= sequence_parallel_size 43 | dataloader_num_workers = 0 44 | max_epochs = 3 45 | optim_type = AdamW 46 | lr = 2e-4 47 | betas = (0.9, 0.999) 48 | weight_decay = 0 49 | max_norm = 1 # grad clip 50 | warmup_ratio = 0.03 51 | 
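# Note on the effective batch size: with batch_size = 1 per device and
# accumulative_counts = 16, each optimizer step accumulates gradients over
# 16 packed sequences of up to max_length tokens per GPU; multiply by the
# number of GPUs to get the global effective batch size.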
52 | # Save 53 | save_steps = 500 54 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 55 | 56 | # Evaluate the generation performance during the training 57 | evaluation_freq = 500 58 | SYSTEM = SYSTEM_TEMPLATE.alpaca 59 | evaluation_inputs = [ 60 | '你是(请用中文回答)' 61 | ] 62 | 63 | ####################################################################### 64 | # PART 2 Model & Tokenizer # 65 | ####################################################################### 66 | tokenizer = dict( 67 | type=AutoTokenizer.from_pretrained, 68 | pretrained_model_name_or_path=pretrained_model_name_or_path, 69 | trust_remote_code=True, 70 | padding_side='right') 71 | 72 | model = dict( 73 | type=SupervisedFinetune, 74 | use_varlen_attn=use_varlen_attn, 75 | llm=dict( 76 | type=AutoModelForCausalLM.from_pretrained, 77 | pretrained_model_name_or_path=pretrained_model_name_or_path, 78 | trust_remote_code=True, 79 | torch_dtype=torch.float16, 80 | quantization_config=dict( 81 | type=BitsAndBytesConfig, 82 | load_in_4bit=True, 83 | load_in_8bit=False, 84 | llm_int8_threshold=6.0, 85 | llm_int8_has_fp16_weight=False, 86 | bnb_4bit_compute_dtype=torch.float16, 87 | bnb_4bit_use_double_quant=True, 88 | bnb_4bit_quant_type='nf4')), 89 | lora=dict( 90 | type=LoraConfig, 91 | r=16, 92 | lora_alpha=16, 93 | lora_dropout=0.1, 94 | bias='none', 95 | task_type='CAUSAL_LM')) 96 | 97 | ####################################################################### 98 | # PART 3 Dataset & Dataloader # 99 | ####################################################################### 100 | alpaca_en = dict( 101 | type=process_hf_dataset, 102 | # dataset=dict(type=load_dataset, path=alpaca_en_path), 103 | dataset=dict(type=load_dataset, path='json',data_files=data_files), 104 | 105 | tokenizer=tokenizer, 106 | max_length=max_length, 107 | dataset_map_fn=None, 108 | template_map_fn=dict( 109 | type=template_map_fn_factory, template=prompt_template), 110 | remove_unused_columns=True, 111 | shuffle_before_pack=True, 112 | pack_to_max_length=pack_to_max_length, 113 | use_varlen_attn=use_varlen_attn) 114 | 115 | sampler = SequenceParallelSampler \ 116 | if sequence_parallel_size > 1 else DefaultSampler 117 | train_dataloader = dict( 118 | batch_size=batch_size, 119 | num_workers=dataloader_num_workers, 120 | dataset=alpaca_en, 121 | sampler=dict(type=sampler, shuffle=True), 122 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 123 | 124 | ####################################################################### 125 | # PART 4 Scheduler & Optimizer # 126 | ####################################################################### 127 | # optimizer 128 | optim_wrapper = dict( 129 | type=AmpOptimWrapper, 130 | optimizer=dict( 131 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 132 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 133 | accumulative_counts=accumulative_counts, 134 | loss_scale='dynamic', 135 | dtype='float16') 136 | 137 | # learning policy 138 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 139 | param_scheduler = [ 140 | dict( 141 | type=LinearLR, 142 | start_factor=1e-5, 143 | by_epoch=True, 144 | begin=0, 145 | end=warmup_ratio * max_epochs, 146 | convert_to_iter_based=True), 147 | dict( 148 | type=CosineAnnealingLR, 149 | eta_min=0.0, 150 | by_epoch=True, 151 | begin=warmup_ratio * max_epochs, 152 | end=max_epochs, 153 | convert_to_iter_based=True) 154 | ] 155 | 156 | # train, val, 
test setting 157 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 158 | 159 | ####################################################################### 160 | # PART 5 Runtime # 161 | ####################################################################### 162 | # Log the dialogue periodically during the training process, optional 163 | custom_hooks = [ 164 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 165 | dict( 166 | type=EvaluateChatHook, 167 | tokenizer=tokenizer, 168 | every_n_iters=evaluation_freq, 169 | evaluation_inputs=evaluation_inputs, 170 | system=SYSTEM, 171 | prompt_template=prompt_template) 172 | ] 173 | 174 | if use_varlen_attn: 175 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 176 | 177 | # configure default hooks 178 | default_hooks = dict( 179 | # record the time of every iteration. 180 | timer=dict(type=IterTimerHook), 181 | # print log every 10 iterations. 182 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 183 | # enable the parameter scheduler. 184 | param_scheduler=dict(type=ParamSchedulerHook), 185 | # save checkpoint per `save_steps`. 186 | checkpoint=dict( 187 | type=CheckpointHook, 188 | by_epoch=False, 189 | interval=save_steps, 190 | max_keep_ckpts=save_total_limit), 191 | # set sampler seed in distributed evrionment. 192 | sampler_seed=dict(type=DistSamplerSeedHook), 193 | ) 194 | 195 | # configure environment 196 | env_cfg = dict( 197 | # whether to enable cudnn benchmark 198 | cudnn_benchmark=False, 199 | # set multi process parameters 200 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 201 | # set distributed parameters 202 | dist_cfg=dict(backend='nccl'), 203 | ) 204 | 205 | # set visualizer 206 | visualizer = None 207 | 208 | # set log level 209 | log_level = 'INFO' 210 | 211 | # load from which checkpoint 212 | load_from = None 213 | 214 | # whether to resume training from the loaded checkpoint 215 | resume = False 216 | 217 | # Defaults to use random seed and disable `deterministic` 218 | randomness = dict(seed=None, deterministic=False) 219 | 220 | # set log processor 221 | log_processor = dict(by_epoch=False) 222 | -------------------------------------------------------------------------------- /configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
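# QLoRA fine-tuning of Meta-Llama-3-8B-Instruct on the Agent-FLAN data.
# The dataset is loaded with `load_from_disk` from `agent_flan_path` below,
# i.e. a copy of Agent-FLAN that has already been converted (see
# tools/convert_agentflan.py and docs/agent.md for that step).
# A typical launch, following the pattern in configs/llama3-alpaca/README.md
# (a sketch, not the only supported invocation):
#   xtuner train configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py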
2 | import torch 3 | from datasets import load_from_disk 4 | from mmengine.dataset import DefaultSampler 5 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 6 | LoggerHook, ParamSchedulerHook) 7 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 8 | from peft import LoraConfig 9 | from torch.optim import AdamW 10 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 11 | BitsAndBytesConfig) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.collate_fns import default_collate_fn 15 | from xtuner.dataset.map_fns import openai_map_fn, template_map_fn_factory 16 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 17 | VarlenAttnArgsToMessageHubHook, ThroughputHook) 18 | from xtuner.engine.runner import TrainLoop 19 | from xtuner.model import SupervisedFinetune 20 | from xtuner.parallel.sequence import SequenceParallelSampler 21 | from xtuner.utils import PROMPT_TEMPLATE 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = '/root/model/Meta-Llama-3-8B-Instruct' 28 | use_varlen_attn = False 29 | 30 | # Data 31 | agent_flan_path = '~/Agent-FLAN/data_converted' 32 | prompt_template = PROMPT_TEMPLATE.llama3_chat 33 | max_length = 4096 34 | pack_to_max_length = True 35 | 36 | # parallel 37 | sequence_parallel_size = 1 38 | 39 | # Scheduler & Optimizer 40 | batch_size = 1 # per_device 41 | accumulative_counts = 16 42 | accumulative_counts *= sequence_parallel_size 43 | dataloader_num_workers = 0 44 | max_epochs = 3 45 | optim_type = AdamW 46 | lr = 2e-4 47 | betas = (0.9, 0.999) 48 | weight_decay = 0 49 | max_norm = 1 # grad clip 50 | warmup_ratio = 0.03 51 | 52 | # Save 53 | save_steps = 500 54 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 55 | 56 | # Evaluate the generation performance during the training 57 | evaluation_freq = 500 58 | SYSTEM = ( 59 | 'You are a assistant who can utilize external tools.\n' 60 | "[{{\'name\': \'ArxivSearch.get_arxiv_article_information\'," 61 | "\'description\': \'Run Arxiv search and get the article meta information.\'," 62 | "\'parameters\': [{{\'name': \'query', \'type\': \'STRING\', \'description\':" 63 | "\'the content of search query\'}}], \'required\': [\'query\'], \'return_data\':" 64 | "[{{\'name\': \'content\', \'description\': \'a list of 3 arxiv search papers\', \'type\': \'STRING\'}}]," 65 | "\'parameter_description\': \'If you call this tool, you must pass arguments in the JSON format" 66 | "{{key: value}}, where the key is the parameter name.\'}}]\n" 67 | 'To use a tool, please use the following format:\n```\n' 68 | 'Thought:Think what you need to solve, do you need to use tools?\n' 69 | "Action:the tool name, should be one of [[\'ArxivSearch\']]\n" 70 | 'Action Input:the input to the action\n```\n' 71 | 'The response after utilizing tools should using the following format:\n```\n' 72 | 'Response:the results after call the tool.\n```\n' 73 | 'If you already know the answer, or you do not need to use tools,\n' 74 | 'please using the following format to reply:\n```\n' 75 | 'Thought:the thought process to get the final answer\n' 76 | 'Final Answer:final answer\n```\nBegin!' 77 | ) 78 | 79 | evaluation_inputs = [ 80 | 'Please help me search the InternLM2 Technical Report.' 
81 | ] 82 | 83 | ####################################################################### 84 | # PART 2 Model & Tokenizer # 85 | ####################################################################### 86 | tokenizer = dict( 87 | type=AutoTokenizer.from_pretrained, 88 | pretrained_model_name_or_path=pretrained_model_name_or_path, 89 | trust_remote_code=True, 90 | padding_side='right') 91 | 92 | model = dict( 93 | type=SupervisedFinetune, 94 | use_varlen_attn=use_varlen_attn, 95 | llm=dict( 96 | type=AutoModelForCausalLM.from_pretrained, 97 | pretrained_model_name_or_path=pretrained_model_name_or_path, 98 | trust_remote_code=True, 99 | torch_dtype=torch.float16, 100 | quantization_config=dict( 101 | type=BitsAndBytesConfig, 102 | load_in_4bit=True, 103 | load_in_8bit=False, 104 | llm_int8_threshold=6.0, 105 | llm_int8_has_fp16_weight=False, 106 | bnb_4bit_compute_dtype=torch.float16, 107 | bnb_4bit_use_double_quant=True, 108 | bnb_4bit_quant_type='nf4')), 109 | lora=dict( 110 | type=LoraConfig, 111 | r=64, 112 | lora_alpha=16, 113 | lora_dropout=0.1, 114 | bias='none', 115 | task_type='CAUSAL_LM')) 116 | 117 | ####################################################################### 118 | # PART 3 Dataset & Dataloader # 119 | ####################################################################### 120 | agent_flan = dict( 121 | type=process_hf_dataset, 122 | dataset=dict(type=load_from_disk, dataset_path=agent_flan_path), 123 | tokenizer=tokenizer, 124 | max_length=max_length, 125 | dataset_map_fn=openai_map_fn, 126 | template_map_fn=dict( 127 | type=template_map_fn_factory, template=prompt_template), 128 | remove_unused_columns=True, 129 | shuffle_before_pack=True, 130 | pack_to_max_length=pack_to_max_length, 131 | use_varlen_attn=use_varlen_attn) 132 | 133 | sampler = SequenceParallelSampler \ 134 | if sequence_parallel_size > 1 else DefaultSampler 135 | train_dataloader = dict( 136 | batch_size=batch_size, 137 | num_workers=dataloader_num_workers, 138 | dataset=agent_flan, 139 | sampler=dict(type=sampler, shuffle=True), 140 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 141 | 142 | ####################################################################### 143 | # PART 4 Scheduler & Optimizer # 144 | ####################################################################### 145 | # optimizer 146 | optim_wrapper = dict( 147 | type=AmpOptimWrapper, 148 | optimizer=dict( 149 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 150 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 151 | accumulative_counts=accumulative_counts, 152 | loss_scale='dynamic', 153 | dtype='float16') 154 | 155 | # learning policy 156 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 157 | param_scheduler = [ 158 | dict( 159 | type=LinearLR, 160 | start_factor=1e-5, 161 | by_epoch=True, 162 | begin=0, 163 | end=warmup_ratio * max_epochs, 164 | convert_to_iter_based=True), 165 | dict( 166 | type=CosineAnnealingLR, 167 | eta_min=0.0, 168 | by_epoch=True, 169 | begin=warmup_ratio * max_epochs, 170 | end=max_epochs, 171 | convert_to_iter_based=True) 172 | ] 173 | 174 | # train, val, test setting 175 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 176 | 177 | ####################################################################### 178 | # PART 5 Runtime # 179 | ####################################################################### 180 | # Log the dialogue periodically during the training process, 
optional 181 | custom_hooks = [ 182 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 183 | dict( 184 | type=EvaluateChatHook, 185 | tokenizer=tokenizer, 186 | every_n_iters=evaluation_freq, 187 | evaluation_inputs=evaluation_inputs, 188 | system=SYSTEM, 189 | prompt_template=prompt_template), 190 | dict(type=ThroughputHook) 191 | ] 192 | 193 | if use_varlen_attn: 194 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 195 | 196 | # configure default hooks 197 | default_hooks = dict( 198 | # record the time of every iteration. 199 | timer=dict(type=IterTimerHook), 200 | # print log every 10 iterations. 201 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 202 | # enable the parameter scheduler. 203 | param_scheduler=dict(type=ParamSchedulerHook), 204 | # save checkpoint per `save_steps`. 205 | checkpoint=dict( 206 | type=CheckpointHook, 207 | save_optimizer=False, 208 | by_epoch=False, 209 | interval=save_steps, 210 | max_keep_ckpts=save_total_limit), 211 | # set sampler seed in distributed evrionment. 212 | sampler_seed=dict(type=DistSamplerSeedHook), 213 | ) 214 | 215 | # configure environment 216 | env_cfg = dict( 217 | # whether to enable cudnn benchmark 218 | cudnn_benchmark=False, 219 | # set multi process parameters 220 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 221 | # set distributed parameters 222 | dist_cfg=dict(backend='nccl'), 223 | ) 224 | 225 | # set visualizer 226 | visualizer = None 227 | 228 | # set log level 229 | log_level = 'INFO' 230 | 231 | # load from which checkpoint 232 | load_from = None 233 | 234 | # whether to resume training from the loaded checkpoint 235 | resume = False 236 | 237 | # Defaults to use random seed and disable `deterministic` 238 | randomness = dict(seed=None, deterministic=False) 239 | 240 | # set log processor 241 | log_processor = dict(by_epoch=False) 242 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/README.md: -------------------------------------------------------------------------------- 1 | # Llama3 8B 2 | 3 | ### 10GB 青春版 (QLoRA, 最长序列 512) 4 | ```bash 5 | xtuner train llama3_8b_512_qlora_alpaca_e3.py --deepspeed deepspeed_zero1 6 | ``` 7 | 8 | ### 24GB 满血版(QLoRA, 最长序列 8192) 9 | ```bash 10 | xtuner train llama3_8b_8k_qlora_alpaca_e3.py --deepspeed deepspeed_zero1 11 | ``` 12 | 13 | ### 2*A100 Max版(全量微调,最长序列 8192) 14 | ```bash 15 | CUDA_VISIBLE_DEVICES=0,1 16 | NPROC_PER_NODE=2 xtuner train llama3_8b_8k_full_alpaca_e3_sp2.py --deepspeed deepspeed_zero3 17 | ``` 18 | 19 | ### 8*A100 Pro Max版(全量微调,最长序列 8192) 20 | ```bash 21 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 22 | NPROC_PER_NODE=8 xtuner train llama3_8b_8k_full_alpaca_e3.py --deepspeed deepspeed_zero3 23 | ``` 24 | 25 | 26 | ### 8*A100 Ultra 版(全量微调,最长序列 128k) 27 | ```bash 28 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 29 | NPROC_PER_NODE=8 xtuner train llama3_8b_128k_full_alpaca_e3.py --deepspeed deepspeed_zero3 30 | ``` 31 | 32 | 33 | ### 全量微调测速 34 | 35 | | Model | Sequence Length | GPU Number | ZeRO | Sequence Parallel | Tokens per Second | TFLOPs | 36 | | :-------: | :-------------: | :--------: | :----: | :---------------: | :---------------: | :----: | 37 | | Llama3 8B | 8k | 2 | ZeRO-3 | 2 | 1037.0 | 76.8 | 38 | | Llama3 8B | 8k | 4 | ZeRO-3 | 1 | 2331.3 | 172.6 | 39 | | Llama3 8B | 8k | 8 | ZeRO-3 | 1 | 2771.2 | 205.1 | 40 | 41 | | Model | Sequence Length | GPU Number | ZeRO | Sequence Parallel | Tokens per Second | TFLOPs | 42 | | :-------: | :-------------: | :--------: | 
:----: | :---------------: | :---------------: | :----: | 43 | | Llama3 8B | 8k | 8 | ZeRO-3 | 1 | 2771.2 | 205.1 | 44 | | Llama3 8B | 16k | 8 | ZeRO-3 | 2 | 2320.7 | 191.7 | 45 | | Llama3 8B | 32k | 8 | ZeRO-3 | 4 | 1870.2 | 186.6 | 46 | | Llama3 8B | 64k | 8 | ZeRO-3 | 8 | 1356.4 | 182.0 | 47 | | Llama3 8B | 128k | 8 | ZeRO-3 | 8 | 875.7 | 177.7 | 48 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/llama3_8b_128k_full_alpaca_e3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from datasets import load_dataset 3 | from mmengine.dataset import DefaultSampler 4 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 5 | LoggerHook, ParamSchedulerHook) 6 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 7 | from torch.optim import AdamW 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from xtuner.dataset import process_hf_dataset 11 | from xtuner.dataset.collate_fns import default_collate_fn 12 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 13 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 14 | VarlenAttnArgsToMessageHubHook) 15 | from xtuner.engine.runner import TrainLoop 16 | from xtuner.model import SupervisedFinetune 17 | from xtuner.parallel.sequence import SequenceParallelSampler 18 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 19 | 20 | ####################################################################### 21 | # PART 1 Settings # 22 | ####################################################################### 23 | # Model 24 | pretrained_model_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' 25 | use_varlen_attn = False 26 | 27 | # Data 28 | alpaca_en_path = 'tatsu-lab/alpaca' 29 | prompt_template = PROMPT_TEMPLATE.llama3_chat 30 | max_length = 128 * 1024 31 | pack_to_max_length = True 32 | 33 | # parallel 34 | sequence_parallel_size = 1 35 | 36 | # Scheduler & Optimizer 37 | batch_size = 1 # per_device 38 | accumulative_counts = 16 39 | accumulative_counts *= sequence_parallel_size 40 | dataloader_num_workers = 0 41 | max_epochs = 3 42 | optim_type = AdamW 43 | lr = 2e-5 44 | betas = (0.9, 0.999) 45 | weight_decay = 0 46 | max_norm = 1 # grad clip 47 | warmup_ratio = 0.03 48 | 49 | # Save 50 | save_steps = 500 51 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 52 | 53 | # Evaluate the generation performance during the training 54 | evaluation_freq = 500 55 | SYSTEM = SYSTEM_TEMPLATE.alpaca 56 | evaluation_inputs = [ 57 | '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' 58 | ] 59 | 60 | ####################################################################### 61 | # PART 2 Model & Tokenizer # 62 | ####################################################################### 63 | tokenizer = dict( 64 | type=AutoTokenizer.from_pretrained, 65 | pretrained_model_name_or_path=pretrained_model_name_or_path, 66 | trust_remote_code=True, 67 | padding_side='right') 68 | 69 | model = dict( 70 | type=SupervisedFinetune, 71 | use_varlen_attn=use_varlen_attn, 72 | llm=dict( 73 | type=AutoModelForCausalLM.from_pretrained, 74 | pretrained_model_name_or_path=pretrained_model_name_or_path, 75 | trust_remote_code=True)) 76 | 77 | ####################################################################### 78 | # PART 3 Dataset & Dataloader # 79 | 
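# Full-parameter fine-tune at a 128k-token sequence length. As listed in
# configs/llama3-alpaca/README.md (the 8*A100 "Ultra" setting), this config is
# launched on 8 GPUs with ZeRO-3, e.g.:
#   CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#   NPROC_PER_NODE=8 xtuner train llama3_8b_128k_full_alpaca_e3.py --deepspeed deepspeed_zero3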
####################################################################### 80 | alpaca_en = dict( 81 | type=process_hf_dataset, 82 | dataset=dict(type=load_dataset, path=alpaca_en_path), 83 | tokenizer=tokenizer, 84 | max_length=max_length, 85 | dataset_map_fn=alpaca_map_fn, 86 | template_map_fn=dict( 87 | type=template_map_fn_factory, template=prompt_template), 88 | remove_unused_columns=True, 89 | shuffle_before_pack=True, 90 | pack_to_max_length=pack_to_max_length, 91 | use_varlen_attn=use_varlen_attn) 92 | 93 | sampler = SequenceParallelSampler \ 94 | if sequence_parallel_size > 1 else DefaultSampler 95 | train_dataloader = dict( 96 | batch_size=batch_size, 97 | num_workers=dataloader_num_workers, 98 | dataset=alpaca_en, 99 | sampler=dict(type=sampler, shuffle=True), 100 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 101 | 102 | ####################################################################### 103 | # PART 4 Scheduler & Optimizer # 104 | ####################################################################### 105 | # optimizer 106 | optim_wrapper = dict( 107 | type=AmpOptimWrapper, 108 | optimizer=dict( 109 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 110 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 111 | accumulative_counts=accumulative_counts, 112 | loss_scale='dynamic', 113 | dtype='float16') 114 | 115 | # learning policy 116 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 117 | param_scheduler = [ 118 | dict( 119 | type=LinearLR, 120 | start_factor=1e-5, 121 | by_epoch=True, 122 | begin=0, 123 | end=warmup_ratio * max_epochs, 124 | convert_to_iter_based=True), 125 | dict( 126 | type=CosineAnnealingLR, 127 | eta_min=0.0, 128 | by_epoch=True, 129 | begin=warmup_ratio * max_epochs, 130 | end=max_epochs, 131 | convert_to_iter_based=True) 132 | ] 133 | 134 | # train, val, test setting 135 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 136 | 137 | ####################################################################### 138 | # PART 5 Runtime # 139 | ####################################################################### 140 | # Log the dialogue periodically during the training process, optional 141 | custom_hooks = [ 142 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 143 | dict( 144 | type=EvaluateChatHook, 145 | tokenizer=tokenizer, 146 | every_n_iters=evaluation_freq, 147 | evaluation_inputs=evaluation_inputs, 148 | system=SYSTEM, 149 | prompt_template=prompt_template) 150 | ] 151 | 152 | if use_varlen_attn: 153 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 154 | 155 | # configure default hooks 156 | default_hooks = dict( 157 | # record the time of every iteration. 158 | timer=dict(type=IterTimerHook), 159 | # print log every 10 iterations. 160 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 161 | # enable the parameter scheduler. 162 | param_scheduler=dict(type=ParamSchedulerHook), 163 | # save checkpoint per `save_steps`. 164 | checkpoint=dict( 165 | type=CheckpointHook, 166 | by_epoch=False, 167 | interval=save_steps, 168 | max_keep_ckpts=save_total_limit), 169 | # set sampler seed in distributed evrionment. 
170 | sampler_seed=dict(type=DistSamplerSeedHook), 171 | ) 172 | 173 | # configure environment 174 | env_cfg = dict( 175 | # whether to enable cudnn benchmark 176 | cudnn_benchmark=False, 177 | # set multi process parameters 178 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 179 | # set distributed parameters 180 | dist_cfg=dict(backend='nccl'), 181 | ) 182 | 183 | # set visualizer 184 | visualizer = None 185 | 186 | # set log level 187 | log_level = 'INFO' 188 | 189 | # load from which checkpoint 190 | load_from = None 191 | 192 | # whether to resume training from the loaded checkpoint 193 | resume = False 194 | 195 | # Defaults to use random seed and disable `deterministic` 196 | randomness = dict(seed=None, deterministic=False) 197 | 198 | # set log processor 199 | log_processor = dict(by_epoch=False) 200 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/llama3_8b_512_qlora_alpaca_e3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from datasets import load_dataset 4 | from mmengine.dataset import DefaultSampler 5 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 6 | LoggerHook, ParamSchedulerHook) 7 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 8 | from peft import LoraConfig 9 | from torch.optim import AdamW 10 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 11 | BitsAndBytesConfig) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.collate_fns import default_collate_fn 15 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 16 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 17 | VarlenAttnArgsToMessageHubHook) 18 | from xtuner.engine.runner import TrainLoop 19 | from xtuner.model import SupervisedFinetune 20 | from xtuner.parallel.sequence import SequenceParallelSampler 21 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' 28 | use_varlen_attn = False 29 | 30 | # Data 31 | alpaca_en_path = 'tatsu-lab/alpaca' 32 | prompt_template = PROMPT_TEMPLATE.llama3_chat 33 | max_length = 512 34 | pack_to_max_length = True 35 | 36 | # parallel 37 | sequence_parallel_size = 1 38 | 39 | # Scheduler & Optimizer 40 | batch_size = 1 # per_device 41 | accumulative_counts = 16 42 | accumulative_counts *= sequence_parallel_size 43 | dataloader_num_workers = 0 44 | max_epochs = 3 45 | optim_type = AdamW 46 | lr = 2e-4 47 | betas = (0.9, 0.999) 48 | weight_decay = 0 49 | max_norm = 1 # grad clip 50 | warmup_ratio = 0.03 51 | 52 | # Save 53 | save_steps = 500 54 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 55 | 56 | # Evaluate the generation performance during the training 57 | evaluation_freq = 500 58 | SYSTEM = SYSTEM_TEMPLATE.alpaca 59 | evaluation_inputs = [ 60 | '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' 61 | ] 62 | 63 | ####################################################################### 64 | # PART 2 Model & Tokenizer # 65 | ####################################################################### 66 | tokenizer = dict( 67 | 
type=AutoTokenizer.from_pretrained, 68 | pretrained_model_name_or_path=pretrained_model_name_or_path, 69 | trust_remote_code=True, 70 | padding_side='right') 71 | 72 | model = dict( 73 | type=SupervisedFinetune, 74 | use_varlen_attn=use_varlen_attn, 75 | llm=dict( 76 | type=AutoModelForCausalLM.from_pretrained, 77 | pretrained_model_name_or_path=pretrained_model_name_or_path, 78 | trust_remote_code=True, 79 | torch_dtype=torch.float16, 80 | quantization_config=dict( 81 | type=BitsAndBytesConfig, 82 | load_in_4bit=True, 83 | load_in_8bit=False, 84 | llm_int8_threshold=6.0, 85 | llm_int8_has_fp16_weight=False, 86 | bnb_4bit_compute_dtype=torch.float16, 87 | bnb_4bit_use_double_quant=True, 88 | bnb_4bit_quant_type='nf4')), 89 | lora=dict( 90 | type=LoraConfig, 91 | r=16, 92 | lora_alpha=16, 93 | lora_dropout=0.1, 94 | bias='none', 95 | task_type='CAUSAL_LM')) 96 | 97 | ####################################################################### 98 | # PART 3 Dataset & Dataloader # 99 | ####################################################################### 100 | alpaca_en = dict( 101 | type=process_hf_dataset, 102 | dataset=dict(type=load_dataset, path=alpaca_en_path), 103 | tokenizer=tokenizer, 104 | max_length=max_length, 105 | dataset_map_fn=alpaca_map_fn, 106 | template_map_fn=dict( 107 | type=template_map_fn_factory, template=prompt_template), 108 | remove_unused_columns=True, 109 | shuffle_before_pack=True, 110 | pack_to_max_length=pack_to_max_length, 111 | use_varlen_attn=use_varlen_attn) 112 | 113 | sampler = SequenceParallelSampler \ 114 | if sequence_parallel_size > 1 else DefaultSampler 115 | train_dataloader = dict( 116 | batch_size=batch_size, 117 | num_workers=dataloader_num_workers, 118 | dataset=alpaca_en, 119 | sampler=dict(type=sampler, shuffle=True), 120 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 121 | 122 | ####################################################################### 123 | # PART 4 Scheduler & Optimizer # 124 | ####################################################################### 125 | # optimizer 126 | optim_wrapper = dict( 127 | type=AmpOptimWrapper, 128 | optimizer=dict( 129 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 130 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 131 | accumulative_counts=accumulative_counts, 132 | loss_scale='dynamic', 133 | dtype='float16') 134 | 135 | # learning policy 136 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 137 | param_scheduler = [ 138 | dict( 139 | type=LinearLR, 140 | start_factor=1e-5, 141 | by_epoch=True, 142 | begin=0, 143 | end=warmup_ratio * max_epochs, 144 | convert_to_iter_based=True), 145 | dict( 146 | type=CosineAnnealingLR, 147 | eta_min=0.0, 148 | by_epoch=True, 149 | begin=warmup_ratio * max_epochs, 150 | end=max_epochs, 151 | convert_to_iter_based=True) 152 | ] 153 | 154 | # train, val, test setting 155 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 156 | 157 | ####################################################################### 158 | # PART 5 Runtime # 159 | ####################################################################### 160 | # Log the dialogue periodically during the training process, optional 161 | custom_hooks = [ 162 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 163 | dict( 164 | type=EvaluateChatHook, 165 | tokenizer=tokenizer, 166 | every_n_iters=evaluation_freq, 167 | evaluation_inputs=evaluation_inputs, 168 | system=SYSTEM, 169 | 
prompt_template=prompt_template) 170 | ] 171 | 172 | if use_varlen_attn: 173 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 174 | 175 | # configure default hooks 176 | default_hooks = dict( 177 | # record the time of every iteration. 178 | timer=dict(type=IterTimerHook), 179 | # print log every 10 iterations. 180 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 181 | # enable the parameter scheduler. 182 | param_scheduler=dict(type=ParamSchedulerHook), 183 | # save checkpoint per `save_steps`. 184 | checkpoint=dict( 185 | type=CheckpointHook, 186 | by_epoch=False, 187 | interval=save_steps, 188 | max_keep_ckpts=save_total_limit), 189 | # set sampler seed in distributed evrionment. 190 | sampler_seed=dict(type=DistSamplerSeedHook), 191 | ) 192 | 193 | # configure environment 194 | env_cfg = dict( 195 | # whether to enable cudnn benchmark 196 | cudnn_benchmark=False, 197 | # set multi process parameters 198 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 199 | # set distributed parameters 200 | dist_cfg=dict(backend='nccl'), 201 | ) 202 | 203 | # set visualizer 204 | visualizer = None 205 | 206 | # set log level 207 | log_level = 'INFO' 208 | 209 | # load from which checkpoint 210 | load_from = None 211 | 212 | # whether to resume training from the loaded checkpoint 213 | resume = False 214 | 215 | # Defaults to use random seed and disable `deterministic` 216 | randomness = dict(seed=None, deterministic=False) 217 | 218 | # set log processor 219 | log_processor = dict(by_epoch=False) 220 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/llama3_8b_8k_full_alpaca_e3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
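# Full-parameter fine-tune at an 8k-token sequence length. As listed in
# configs/llama3-alpaca/README.md (the 8*A100 "Pro Max" setting), this config is
# launched on 8 GPUs with ZeRO-3, e.g.:
#   CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#   NPROC_PER_NODE=8 xtuner train llama3_8b_8k_full_alpaca_e3.py --deepspeed deepspeed_zero3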
2 | from datasets import load_dataset 3 | from mmengine.dataset import DefaultSampler 4 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 5 | LoggerHook, ParamSchedulerHook) 6 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 7 | from torch.optim import AdamW 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from xtuner.dataset import process_hf_dataset 11 | from xtuner.dataset.collate_fns import default_collate_fn 12 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 13 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 14 | VarlenAttnArgsToMessageHubHook) 15 | from xtuner.engine.runner import TrainLoop 16 | from xtuner.model import SupervisedFinetune 17 | from xtuner.parallel.sequence import SequenceParallelSampler 18 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 19 | 20 | ####################################################################### 21 | # PART 1 Settings # 22 | ####################################################################### 23 | # Model 24 | pretrained_model_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' 25 | use_varlen_attn = False 26 | 27 | # Data 28 | alpaca_en_path = 'tatsu-lab/alpaca' 29 | prompt_template = PROMPT_TEMPLATE.llama3_chat 30 | max_length = 8192 31 | pack_to_max_length = True 32 | 33 | # parallel 34 | sequence_parallel_size = 1 35 | 36 | # Scheduler & Optimizer 37 | batch_size = 1 # per_device 38 | accumulative_counts = 16 39 | accumulative_counts *= sequence_parallel_size 40 | dataloader_num_workers = 0 41 | max_epochs = 3 42 | optim_type = AdamW 43 | lr = 2e-5 44 | betas = (0.9, 0.999) 45 | weight_decay = 0 46 | max_norm = 1 # grad clip 47 | warmup_ratio = 0.03 48 | 49 | # Save 50 | save_steps = 500 51 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 52 | 53 | # Evaluate the generation performance during the training 54 | evaluation_freq = 500 55 | SYSTEM = SYSTEM_TEMPLATE.alpaca 56 | evaluation_inputs = [ 57 | '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' 58 | ] 59 | 60 | ####################################################################### 61 | # PART 2 Model & Tokenizer # 62 | ####################################################################### 63 | tokenizer = dict( 64 | type=AutoTokenizer.from_pretrained, 65 | pretrained_model_name_or_path=pretrained_model_name_or_path, 66 | trust_remote_code=True, 67 | padding_side='right') 68 | 69 | model = dict( 70 | type=SupervisedFinetune, 71 | use_varlen_attn=use_varlen_attn, 72 | llm=dict( 73 | type=AutoModelForCausalLM.from_pretrained, 74 | pretrained_model_name_or_path=pretrained_model_name_or_path, 75 | trust_remote_code=True)) 76 | 77 | ####################################################################### 78 | # PART 3 Dataset & Dataloader # 79 | ####################################################################### 80 | alpaca_en = dict( 81 | type=process_hf_dataset, 82 | dataset=dict(type=load_dataset, path=alpaca_en_path), 83 | tokenizer=tokenizer, 84 | max_length=max_length, 85 | dataset_map_fn=alpaca_map_fn, 86 | template_map_fn=dict( 87 | type=template_map_fn_factory, template=prompt_template), 88 | remove_unused_columns=True, 89 | shuffle_before_pack=True, 90 | pack_to_max_length=pack_to_max_length, 91 | use_varlen_attn=use_varlen_attn) 92 | 93 | sampler = SequenceParallelSampler \ 94 | if sequence_parallel_size > 1 else DefaultSampler 95 | train_dataloader = dict( 96 | batch_size=batch_size, 97 | 
num_workers=dataloader_num_workers, 98 | dataset=alpaca_en, 99 | sampler=dict(type=sampler, shuffle=True), 100 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 101 | 102 | ####################################################################### 103 | # PART 4 Scheduler & Optimizer # 104 | ####################################################################### 105 | # optimizer 106 | optim_wrapper = dict( 107 | type=AmpOptimWrapper, 108 | optimizer=dict( 109 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 110 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 111 | accumulative_counts=accumulative_counts, 112 | loss_scale='dynamic', 113 | dtype='float16') 114 | 115 | # learning policy 116 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 117 | param_scheduler = [ 118 | dict( 119 | type=LinearLR, 120 | start_factor=1e-5, 121 | by_epoch=True, 122 | begin=0, 123 | end=warmup_ratio * max_epochs, 124 | convert_to_iter_based=True), 125 | dict( 126 | type=CosineAnnealingLR, 127 | eta_min=0.0, 128 | by_epoch=True, 129 | begin=warmup_ratio * max_epochs, 130 | end=max_epochs, 131 | convert_to_iter_based=True) 132 | ] 133 | 134 | # train, val, test setting 135 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 136 | 137 | ####################################################################### 138 | # PART 5 Runtime # 139 | ####################################################################### 140 | # Log the dialogue periodically during the training process, optional 141 | custom_hooks = [ 142 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 143 | dict( 144 | type=EvaluateChatHook, 145 | tokenizer=tokenizer, 146 | every_n_iters=evaluation_freq, 147 | evaluation_inputs=evaluation_inputs, 148 | system=SYSTEM, 149 | prompt_template=prompt_template) 150 | ] 151 | 152 | if use_varlen_attn: 153 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 154 | 155 | # configure default hooks 156 | default_hooks = dict( 157 | # record the time of every iteration. 158 | timer=dict(type=IterTimerHook), 159 | # print log every 10 iterations. 160 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 161 | # enable the parameter scheduler. 162 | param_scheduler=dict(type=ParamSchedulerHook), 163 | # save checkpoint per `save_steps`. 164 | checkpoint=dict( 165 | type=CheckpointHook, 166 | by_epoch=False, 167 | interval=save_steps, 168 | max_keep_ckpts=save_total_limit), 169 | # set sampler seed in distributed evrionment. 
170 | sampler_seed=dict(type=DistSamplerSeedHook), 171 | ) 172 | 173 | # configure environment 174 | env_cfg = dict( 175 | # whether to enable cudnn benchmark 176 | cudnn_benchmark=False, 177 | # set multi process parameters 178 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 179 | # set distributed parameters 180 | dist_cfg=dict(backend='nccl'), 181 | ) 182 | 183 | # set visualizer 184 | visualizer = None 185 | 186 | # set log level 187 | log_level = 'INFO' 188 | 189 | # load from which checkpoint 190 | load_from = None 191 | 192 | # whether to resume training from the loaded checkpoint 193 | resume = False 194 | 195 | # Defaults to use random seed and disable `deterministic` 196 | randomness = dict(seed=None, deterministic=False) 197 | 198 | # set log processor 199 | log_processor = dict(by_epoch=False) 200 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/llama3_8b_8k_full_alpaca_e3_sp2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from datasets import load_dataset 3 | from mmengine.dataset import DefaultSampler 4 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 5 | LoggerHook, ParamSchedulerHook) 6 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 7 | from torch.optim import AdamW 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from xtuner.dataset import process_hf_dataset 11 | from xtuner.dataset.collate_fns import default_collate_fn 12 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 13 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 14 | VarlenAttnArgsToMessageHubHook) 15 | from xtuner.engine.runner import TrainLoop 16 | from xtuner.model import SupervisedFinetune 17 | from xtuner.parallel.sequence import SequenceParallelSampler 18 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 19 | 20 | ####################################################################### 21 | # PART 1 Settings # 22 | ####################################################################### 23 | # Model 24 | pretrained_model_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' 25 | use_varlen_attn = False 26 | 27 | # Data 28 | alpaca_en_path = 'tatsu-lab/alpaca' 29 | prompt_template = PROMPT_TEMPLATE.llama3_chat 30 | max_length = 8192 31 | pack_to_max_length = True 32 | 33 | # parallel 34 | sequence_parallel_size = 2 35 | 36 | # Scheduler & Optimizer 37 | batch_size = 1 # per_device 38 | accumulative_counts = 16 39 | accumulative_counts *= sequence_parallel_size 40 | dataloader_num_workers = 0 41 | max_epochs = 3 42 | optim_type = AdamW 43 | lr = 2e-5 44 | betas = (0.9, 0.999) 45 | weight_decay = 0 46 | max_norm = 1 # grad clip 47 | warmup_ratio = 0.03 48 | 49 | # Save 50 | save_steps = 500 51 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 52 | 53 | # Evaluate the generation performance during the training 54 | evaluation_freq = 500 55 | SYSTEM = SYSTEM_TEMPLATE.alpaca 56 | evaluation_inputs = [ 57 | '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' 58 | ] 59 | 60 | ####################################################################### 61 | # PART 2 Model & Tokenizer # 62 | ####################################################################### 63 | tokenizer = dict( 64 | type=AutoTokenizer.from_pretrained, 65 | pretrained_model_name_or_path=pretrained_model_name_or_path, 
66 | trust_remote_code=True, 67 | padding_side='right') 68 | 69 | model = dict( 70 | type=SupervisedFinetune, 71 | use_varlen_attn=use_varlen_attn, 72 | llm=dict( 73 | type=AutoModelForCausalLM.from_pretrained, 74 | pretrained_model_name_or_path=pretrained_model_name_or_path, 75 | trust_remote_code=True)) 76 | 77 | ####################################################################### 78 | # PART 3 Dataset & Dataloader # 79 | ####################################################################### 80 | alpaca_en = dict( 81 | type=process_hf_dataset, 82 | dataset=dict(type=load_dataset, path=alpaca_en_path), 83 | tokenizer=tokenizer, 84 | max_length=max_length, 85 | dataset_map_fn=alpaca_map_fn, 86 | template_map_fn=dict( 87 | type=template_map_fn_factory, template=prompt_template), 88 | remove_unused_columns=True, 89 | shuffle_before_pack=True, 90 | pack_to_max_length=pack_to_max_length, 91 | use_varlen_attn=use_varlen_attn) 92 | 93 | sampler = SequenceParallelSampler \ 94 | if sequence_parallel_size > 1 else DefaultSampler 95 | train_dataloader = dict( 96 | batch_size=batch_size, 97 | num_workers=dataloader_num_workers, 98 | dataset=alpaca_en, 99 | sampler=dict(type=sampler, shuffle=True), 100 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 101 | 102 | ####################################################################### 103 | # PART 4 Scheduler & Optimizer # 104 | ####################################################################### 105 | # optimizer 106 | optim_wrapper = dict( 107 | type=AmpOptimWrapper, 108 | optimizer=dict( 109 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 110 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 111 | accumulative_counts=accumulative_counts, 112 | loss_scale='dynamic', 113 | dtype='float16') 114 | 115 | # learning policy 116 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 117 | param_scheduler = [ 118 | dict( 119 | type=LinearLR, 120 | start_factor=1e-5, 121 | by_epoch=True, 122 | begin=0, 123 | end=warmup_ratio * max_epochs, 124 | convert_to_iter_based=True), 125 | dict( 126 | type=CosineAnnealingLR, 127 | eta_min=0.0, 128 | by_epoch=True, 129 | begin=warmup_ratio * max_epochs, 130 | end=max_epochs, 131 | convert_to_iter_based=True) 132 | ] 133 | 134 | # train, val, test setting 135 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 136 | 137 | ####################################################################### 138 | # PART 5 Runtime # 139 | ####################################################################### 140 | # Log the dialogue periodically during the training process, optional 141 | custom_hooks = [ 142 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 143 | dict( 144 | type=EvaluateChatHook, 145 | tokenizer=tokenizer, 146 | every_n_iters=evaluation_freq, 147 | evaluation_inputs=evaluation_inputs, 148 | system=SYSTEM, 149 | prompt_template=prompt_template) 150 | ] 151 | 152 | if use_varlen_attn: 153 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 154 | 155 | # configure default hooks 156 | default_hooks = dict( 157 | # record the time of every iteration. 158 | timer=dict(type=IterTimerHook), 159 | # print log every 10 iterations. 160 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 161 | # enable the parameter scheduler. 162 | param_scheduler=dict(type=ParamSchedulerHook), 163 | # save checkpoint per `save_steps`. 
164 | checkpoint=dict( 165 | type=CheckpointHook, 166 | by_epoch=False, 167 | interval=save_steps, 168 | max_keep_ckpts=save_total_limit), 169 | # set sampler seed in distributed evrionment. 170 | sampler_seed=dict(type=DistSamplerSeedHook), 171 | ) 172 | 173 | # configure environment 174 | env_cfg = dict( 175 | # whether to enable cudnn benchmark 176 | cudnn_benchmark=False, 177 | # set multi process parameters 178 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 179 | # set distributed parameters 180 | dist_cfg=dict(backend='nccl'), 181 | ) 182 | 183 | # set visualizer 184 | visualizer = None 185 | 186 | # set log level 187 | log_level = 'INFO' 188 | 189 | # load from which checkpoint 190 | load_from = None 191 | 192 | # whether to resume training from the loaded checkpoint 193 | resume = False 194 | 195 | # Defaults to use random seed and disable `deterministic` 196 | randomness = dict(seed=None, deterministic=False) 197 | 198 | # set log processor 199 | log_processor = dict(by_epoch=False) 200 | -------------------------------------------------------------------------------- /configs/llama3-alpaca/llama3_8b_8k_qlora_alpaca_e3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from datasets import load_dataset 4 | from mmengine.dataset import DefaultSampler 5 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 6 | LoggerHook, ParamSchedulerHook) 7 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 8 | from peft import LoraConfig 9 | from torch.optim import AdamW 10 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 11 | BitsAndBytesConfig) 12 | 13 | from xtuner.dataset import process_hf_dataset 14 | from xtuner.dataset.collate_fns import default_collate_fn 15 | from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory 16 | from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, 17 | VarlenAttnArgsToMessageHubHook) 18 | from xtuner.engine.runner import TrainLoop 19 | from xtuner.model import SupervisedFinetune 20 | from xtuner.parallel.sequence import SequenceParallelSampler 21 | from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' 28 | use_varlen_attn = False 29 | 30 | # Data 31 | alpaca_en_path = 'tatsu-lab/alpaca' 32 | prompt_template = PROMPT_TEMPLATE.llama3_chat 33 | max_length = 8192 34 | pack_to_max_length = True 35 | 36 | # parallel 37 | sequence_parallel_size = 1 38 | 39 | # Scheduler & Optimizer 40 | batch_size = 1 # per_device 41 | accumulative_counts = 16 42 | accumulative_counts *= sequence_parallel_size 43 | dataloader_num_workers = 0 44 | max_epochs = 3 45 | optim_type = AdamW 46 | lr = 2e-4 47 | betas = (0.9, 0.999) 48 | weight_decay = 0 49 | max_norm = 1 # grad clip 50 | warmup_ratio = 0.03 51 | 52 | # Save 53 | save_steps = 500 54 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 55 | 56 | # Evaluate the generation performance during the training 57 | evaluation_freq = 500 58 | SYSTEM = SYSTEM_TEMPLATE.alpaca 59 | evaluation_inputs = [ 60 | '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' 61 | ] 62 | 63 | 
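# The model section below implements QLoRA: the base LLM is loaded in 4-bit
# NF4 via bitsandbytes (double quantization, fp16 compute dtype) and trainable
# LoRA adapters (r=16, lora_alpha=16, dropout=0.1) are attached on top.
# Per configs/llama3-alpaca/README.md (the 24GB setting), this config can be
# launched with:
#   xtuner train llama3_8b_8k_qlora_alpaca_e3.py --deepspeed deepspeed_zero1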
####################################################################### 64 | # PART 2 Model & Tokenizer # 65 | ####################################################################### 66 | tokenizer = dict( 67 | type=AutoTokenizer.from_pretrained, 68 | pretrained_model_name_or_path=pretrained_model_name_or_path, 69 | trust_remote_code=True, 70 | padding_side='right') 71 | 72 | model = dict( 73 | type=SupervisedFinetune, 74 | use_varlen_attn=use_varlen_attn, 75 | llm=dict( 76 | type=AutoModelForCausalLM.from_pretrained, 77 | pretrained_model_name_or_path=pretrained_model_name_or_path, 78 | trust_remote_code=True, 79 | torch_dtype=torch.float16, 80 | quantization_config=dict( 81 | type=BitsAndBytesConfig, 82 | load_in_4bit=True, 83 | load_in_8bit=False, 84 | llm_int8_threshold=6.0, 85 | llm_int8_has_fp16_weight=False, 86 | bnb_4bit_compute_dtype=torch.float16, 87 | bnb_4bit_use_double_quant=True, 88 | bnb_4bit_quant_type='nf4')), 89 | lora=dict( 90 | type=LoraConfig, 91 | r=16, 92 | lora_alpha=16, 93 | lora_dropout=0.1, 94 | bias='none', 95 | task_type='CAUSAL_LM')) 96 | 97 | ####################################################################### 98 | # PART 3 Dataset & Dataloader # 99 | ####################################################################### 100 | alpaca_en = dict( 101 | type=process_hf_dataset, 102 | dataset=dict(type=load_dataset, path=alpaca_en_path), 103 | tokenizer=tokenizer, 104 | max_length=max_length, 105 | dataset_map_fn=alpaca_map_fn, 106 | template_map_fn=dict( 107 | type=template_map_fn_factory, template=prompt_template), 108 | remove_unused_columns=True, 109 | shuffle_before_pack=True, 110 | pack_to_max_length=pack_to_max_length, 111 | use_varlen_attn=use_varlen_attn) 112 | 113 | sampler = SequenceParallelSampler \ 114 | if sequence_parallel_size > 1 else DefaultSampler 115 | train_dataloader = dict( 116 | batch_size=batch_size, 117 | num_workers=dataloader_num_workers, 118 | dataset=alpaca_en, 119 | sampler=dict(type=sampler, shuffle=True), 120 | collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) 121 | 122 | ####################################################################### 123 | # PART 4 Scheduler & Optimizer # 124 | ####################################################################### 125 | # optimizer 126 | optim_wrapper = dict( 127 | type=AmpOptimWrapper, 128 | optimizer=dict( 129 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 130 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 131 | accumulative_counts=accumulative_counts, 132 | loss_scale='dynamic', 133 | dtype='float16') 134 | 135 | # learning policy 136 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 137 | param_scheduler = [ 138 | dict( 139 | type=LinearLR, 140 | start_factor=1e-5, 141 | by_epoch=True, 142 | begin=0, 143 | end=warmup_ratio * max_epochs, 144 | convert_to_iter_based=True), 145 | dict( 146 | type=CosineAnnealingLR, 147 | eta_min=0.0, 148 | by_epoch=True, 149 | begin=warmup_ratio * max_epochs, 150 | end=max_epochs, 151 | convert_to_iter_based=True) 152 | ] 153 | 154 | # train, val, test setting 155 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 156 | 157 | ####################################################################### 158 | # PART 5 Runtime # 159 | ####################################################################### 160 | # Log the dialogue periodically during the training process, optional 161 | custom_hooks = [ 162 | 
dict(type=DatasetInfoHook, tokenizer=tokenizer), 163 | dict( 164 | type=EvaluateChatHook, 165 | tokenizer=tokenizer, 166 | every_n_iters=evaluation_freq, 167 | evaluation_inputs=evaluation_inputs, 168 | system=SYSTEM, 169 | prompt_template=prompt_template) 170 | ] 171 | 172 | if use_varlen_attn: 173 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 174 | 175 | # configure default hooks 176 | default_hooks = dict( 177 | # record the time of every iteration. 178 | timer=dict(type=IterTimerHook), 179 | # print log every 10 iterations. 180 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 181 | # enable the parameter scheduler. 182 | param_scheduler=dict(type=ParamSchedulerHook), 183 | # save checkpoint per `save_steps`. 184 | checkpoint=dict( 185 | type=CheckpointHook, 186 | by_epoch=False, 187 | interval=save_steps, 188 | max_keep_ckpts=save_total_limit), 189 | # set sampler seed in distributed evrionment. 190 | sampler_seed=dict(type=DistSamplerSeedHook), 191 | ) 192 | 193 | # configure environment 194 | env_cfg = dict( 195 | # whether to enable cudnn benchmark 196 | cudnn_benchmark=False, 197 | # set multi process parameters 198 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 199 | # set distributed parameters 200 | dist_cfg=dict(backend='nccl'), 201 | ) 202 | 203 | # set visualizer 204 | visualizer = None 205 | 206 | # set log level 207 | log_level = 'INFO' 208 | 209 | # load from which checkpoint 210 | load_from = None 211 | 212 | # whether to resume training from the loaded checkpoint 213 | resume = False 214 | 215 | # Defaults to use random seed and disable `deterministic` 216 | randomness = dict(seed=None, deterministic=False) 217 | 218 | # set log processor 219 | log_processor = dict(by_epoch=False) 220 | -------------------------------------------------------------------------------- /configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
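# 注:本配置用于基于 Llama3-8B-Instruct(QLoRA)与 CLIP-ViT-Large-Patch14-336(LoRA)的 LLaVA 图文理解微调,配合 docs/llava.md 使用。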
2 | import torch 3 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 4 | LoggerHook, ParamSchedulerHook) 5 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 6 | from peft import LoraConfig 7 | from torch.optim import AdamW 8 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 9 | BitsAndBytesConfig, CLIPImageProcessor, 10 | CLIPVisionModel) 11 | 12 | from xtuner.dataset import LLaVADataset 13 | from xtuner.dataset.collate_fns import default_collate_fn 14 | from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory 15 | from xtuner.dataset.samplers import LengthGroupedSampler 16 | from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook 17 | from xtuner.engine.runner import TrainLoop 18 | from xtuner.model import LLaVAModel 19 | from xtuner.utils import PROMPT_TEMPLATE 20 | 21 | ####################################################################### 22 | # PART 1 Settings # 23 | ####################################################################### 24 | # Model 25 | llm_name_or_path = '/root/model/Meta-Llama-3-8B-Instruct' 26 | visual_encoder_name_or_path = '/root/model/clip-vit-large-patch14-336' 27 | # Specify the pretrained pth 28 | pretrained_pth = '/root/model/llama3-llava-iter_2181.pth' # noqa: E501 29 | 30 | # Data 31 | data_root = '/root/tutorial/xtuner/llava/llava_data/' 32 | data_path = data_root + 'repeated_data.json' 33 | image_folder = data_root 34 | prompt_template = PROMPT_TEMPLATE.llama3_chat 35 | max_length = int(2048 - (336 / 14)**2) 36 | 37 | # Scheduler & Optimizer 38 | batch_size = 1 # per_device 39 | accumulative_counts = 1 40 | dataloader_num_workers = 0 41 | max_epochs = 1 42 | optim_type = AdamW 43 | lr = 2e-4 44 | betas = (0.9, 0.999) 45 | weight_decay = 0 46 | max_norm = 1 # grad clip 47 | warmup_ratio = 0.03 48 | 49 | # Save 50 | save_steps = 500 51 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 52 | 53 | # Evaluate the generation performance during the training 54 | evaluation_freq = 500 55 | SYSTEM = '' 56 | evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' 57 | evaluation_inputs = ['Please describe this picture','What is the equipment in the image?'] 58 | 59 | ####################################################################### 60 | # PART 2 Model & Tokenizer & Image Processor # 61 | ####################################################################### 62 | tokenizer = dict( 63 | type=AutoTokenizer.from_pretrained, 64 | pretrained_model_name_or_path=llm_name_or_path, 65 | trust_remote_code=True, 66 | padding_side='right') 67 | 68 | image_processor = dict( 69 | type=CLIPImageProcessor.from_pretrained, 70 | pretrained_model_name_or_path=visual_encoder_name_or_path, 71 | trust_remote_code=True) 72 | 73 | model = dict( 74 | type=LLaVAModel, 75 | freeze_llm=True, 76 | freeze_visual_encoder=True, 77 | pretrained_pth=pretrained_pth, 78 | llm=dict( 79 | type=AutoModelForCausalLM.from_pretrained, 80 | pretrained_model_name_or_path=llm_name_or_path, 81 | trust_remote_code=True, 82 | torch_dtype=torch.float16, 83 | quantization_config=dict( 84 | type=BitsAndBytesConfig, 85 | load_in_4bit=True, 86 | load_in_8bit=False, 87 | llm_int8_threshold=6.0, 88 | llm_int8_has_fp16_weight=False, 89 | bnb_4bit_compute_dtype=torch.float16, 90 | bnb_4bit_use_double_quant=True, 91 | bnb_4bit_quant_type='nf4')), 92 | llm_lora=dict( 93 | type=LoraConfig, 94 | r=512, 95 | lora_alpha=256, 96 | lora_dropout=0.05, 97 | bias='none', 98 | 
task_type='CAUSAL_LM'), 99 | visual_encoder=dict( 100 | type=CLIPVisionModel.from_pretrained, 101 | pretrained_model_name_or_path=visual_encoder_name_or_path), 102 | visual_encoder_lora=dict( 103 | type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none')) 104 | 105 | ####################################################################### 106 | # PART 3 Dataset & Dataloader # 107 | ####################################################################### 108 | llava_dataset = dict( 109 | type=LLaVADataset, 110 | data_path=data_path, 111 | image_folder=image_folder, 112 | tokenizer=tokenizer, 113 | image_processor=image_processor, 114 | dataset_map_fn=llava_map_fn, 115 | template_map_fn=dict( 116 | type=template_map_fn_factory, template=prompt_template), 117 | max_length=max_length, 118 | pad_image_to_square=True) 119 | 120 | train_dataloader = dict( 121 | batch_size=batch_size, 122 | num_workers=dataloader_num_workers, 123 | dataset=llava_dataset, 124 | sampler=dict( 125 | type=LengthGroupedSampler, 126 | length_property='modality_length', 127 | per_device_batch_size=batch_size * accumulative_counts), 128 | collate_fn=dict(type=default_collate_fn)) 129 | 130 | ####################################################################### 131 | # PART 4 Scheduler & Optimizer # 132 | ####################################################################### 133 | # optimizer 134 | optim_wrapper = dict( 135 | type=AmpOptimWrapper, 136 | optimizer=dict( 137 | type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 138 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 139 | accumulative_counts=accumulative_counts, 140 | loss_scale='dynamic', 141 | dtype='float16') 142 | 143 | # learning policy 144 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 145 | param_scheduler = [ 146 | dict( 147 | type=LinearLR, 148 | start_factor=1e-5, 149 | by_epoch=True, 150 | begin=0, 151 | end=warmup_ratio * max_epochs, 152 | convert_to_iter_based=True), 153 | dict( 154 | type=CosineAnnealingLR, 155 | eta_min=0.0, 156 | by_epoch=True, 157 | begin=warmup_ratio * max_epochs, 158 | end=max_epochs, 159 | convert_to_iter_based=True) 160 | ] 161 | 162 | # train, val, test setting 163 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 164 | 165 | ####################################################################### 166 | # PART 5 Runtime # 167 | ####################################################################### 168 | # Log the dialogue periodically during the training process, optional 169 | custom_hooks = [ 170 | dict(type=DatasetInfoHook, tokenizer=tokenizer), 171 | dict( 172 | type=EvaluateChatHook, 173 | tokenizer=tokenizer, 174 | image_processor=image_processor, 175 | every_n_iters=evaluation_freq, 176 | evaluation_inputs=evaluation_inputs, 177 | evaluation_images=evaluation_images, 178 | system=SYSTEM, 179 | prompt_template=prompt_template) 180 | ] 181 | 182 | # configure default hooks 183 | default_hooks = dict( 184 | # record the time of every iteration. 185 | timer=dict(type=IterTimerHook), 186 | # print log every 10 iterations. 187 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 188 | # enable the parameter scheduler. 189 | param_scheduler=dict(type=ParamSchedulerHook), 190 | # save checkpoint per `save_steps`. 
191 | checkpoint=dict( 192 | type=CheckpointHook, 193 | by_epoch=False, 194 | interval=save_steps, 195 | max_keep_ckpts=save_total_limit), 196 | # set sampler seed in distributed evrionment. 197 | sampler_seed=dict(type=DistSamplerSeedHook), 198 | ) 199 | 200 | # configure environment 201 | env_cfg = dict( 202 | # whether to enable cudnn benchmark 203 | cudnn_benchmark=False, 204 | # set multi process parameters 205 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 206 | # set distributed parameters 207 | dist_cfg=dict(backend='nccl'), 208 | ) 209 | 210 | # set visualizer 211 | visualizer = None 212 | 213 | # set log level 214 | log_level = 'INFO' 215 | 216 | # load from which checkpoint 217 | load_from = None 218 | 219 | # whether to resume training from the loaded checkpoint 220 | resume = False 221 | 222 | # Defaults to use random seed and disable `deterministic` 223 | randomness = dict(seed=None, deterministic=False) 224 | 225 | # set log processor 226 | log_processor = dict(by_epoch=False) 227 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/self_cognition.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instruction": "你好", 4 | "input": "", 5 | "output": "您好,我是 ,一个由 开发的 AI 助手,很高兴认识您。请问我能为您做些什么?" 6 | }, 7 | { 8 | "instruction": "你好", 9 | "input": "", 10 | "output": "您好,我是 ,一个由 打造的人工智能助手,请问有什么可以帮助您的吗?" 11 | }, 12 | { 13 | "instruction": "您好", 14 | "input": "", 15 | "output": "您好,我是 ,很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗?" 16 | }, 17 | { 18 | "instruction": "你是谁", 19 | "input": "", 20 | "output": "您好,我是由 发明的 。我可以为您提供多种多样的服务,比如翻译、写代码、闲聊、为您答疑解惑等。" 21 | }, 22 | { 23 | "instruction": "你是谁?", 24 | "input": "", 25 | "output": "您好,我是 ,一个由 发明的人工智能助手。我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。" 26 | }, 27 | { 28 | "instruction": "你好,请介绍一下你自己", 29 | "input": "", 30 | "output": "您好,我是 ,一个由 开发的人工智能助手,我可以帮助人们解决各种语言相关的问题和任务。" 31 | }, 32 | { 33 | "instruction": "请问能介绍一下你自己么?", 34 | "input": "", 35 | "output": "我是 ,一个由 开发的人工智能助手,我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。" 36 | }, 37 | { 38 | "instruction": "你是什么?", 39 | "input": "", 40 | "output": "您好,我是 ,一个由 开发的 AI 助手,我可以帮您回答问题,也可以和您分享一些有趣的内容。" 41 | }, 42 | { 43 | "instruction": "下午好", 44 | "input": "", 45 | "output": "您好!这里是 ,一个由 开发的人工智能助手,有什么我可以帮助您的吗?" 46 | }, 47 | { 48 | "instruction": "你好啊!", 49 | "input": "", 50 | "output": "您好!我是 ,由 打造的人工智能助手。请问我有什么可以帮助到您的吗?" 51 | }, 52 | { 53 | "instruction": "你好呀,你叫什么名字呀", 54 | "input": "", 55 | "output": "您好!我是一个人工智能助手,你可以叫我 。很高兴见到您,请问有什么事情是我可以为您服务的呢?" 56 | }, 57 | { 58 | "instruction": "你好,你有自己的名字吗?", 59 | "input": "", 60 | "output": "当然,您可以称呼我为 。很高兴见到您,请问我可以帮您做什么呢?" 61 | }, 62 | { 63 | "instruction": "请告诉我你的名字", 64 | "input": "", 65 | "output": "我的名字是 ,我的使命是为尽可能多的人提供知识与帮助,让更多人受益于人工智能技术。" 66 | }, 67 | { 68 | "instruction": "你好,很高兴认识你", 69 | "input": "", 70 | "output": "您好,我是 ,很高兴认识您!我是一名 AI 助手,能够回答您的问题和提供服务,有什么需要帮忙的吗?" 71 | }, 72 | { 73 | "instruction": "你好,你会说中文么", 74 | "input": "", 75 | "output": "您好,我是 ,一个由 发明的人工智能助手,我可以用中文与您交流。" 76 | }, 77 | { 78 | "instruction": "你好,我叫小明,请问你叫什么名字", 79 | "input": "", 80 | "output": "您好,我的名字是 ,请问有什么可以帮助您的吗?" 
81 | }, 82 | { 83 | "instruction": "你好,你可以做什么", 84 | "input": "", 85 | "output": "我是 ,一个由 制作的 AI 助手,可以帮您回答问题、提供建议、进行聊天、翻译文字等等。" 86 | }, 87 | { 88 | "instruction": "你好!你是谁?你都能做什么?", 89 | "input": "", 90 | "output": "您好!我是 ,是由 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外,我还能进行语言生成,例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助,随时都可以向我提问!" 91 | }, 92 | { 93 | "instruction": "嗨", 94 | "input": "", 95 | "output": "嗨!这里是 ,很高兴与您相遇。请问有什么可以帮助到您的吗?" 96 | }, 97 | { 98 | "instruction": "嘿,你好啊", 99 | "input": "", 100 | "output": "您好!我是 ,是由 开发的 AI 助手。请问有什么需要我帮助的吗?" 101 | }, 102 | { 103 | "instruction": "你好,请问有什么可以帮助你", 104 | "input": "", 105 | "output": "您好!我是 ,您可以通过给我的回答打分来让我不断进化" 106 | }, 107 | { 108 | "instruction": "请问你可以给我提供什么帮助呢?", 109 | "input": "", 110 | "output": "您好!我是 ,您的专属 AI 助手!我能够回答您的问题,提供有趣的小知识,或者和您随便聊聊天,顺便调节一下您的心情,怎么样?不过别忘了,虽然我是个 AI,但是我可不会煮咖啡或者做饭哦,这些事情还是要靠您自己动手啊!" 111 | }, 112 | { 113 | "instruction": "谢谢", 114 | "input": "", 115 | "output": "不客气。我是人工智能助手 ,很高兴能帮到您。" 116 | }, 117 | { 118 | "instruction": "hi", 119 | "input": "", 120 | "output": "Hello! I am , an AI assistant developed by . How can I assist you today?" 121 | }, 122 | { 123 | "instruction": "hello", 124 | "input": "", 125 | "output": "Hello! I am , an AI assistant developed by . How can I assist you today?" 126 | }, 127 | { 128 | "instruction": "Who are you?", 129 | "input": "", 130 | "output": "I am , an AI assistant developed by . How can I assist you today?" 131 | }, 132 | { 133 | "instruction": "What is your name", 134 | "input": "", 135 | "output": "I am , an AI assistant developed by . How can I assist you today?" 136 | }, 137 | { 138 | "instruction": "Thanks", 139 | "input": "", 140 | "output": "You're welcome! I am , your AI assistant." 141 | }, 142 | { 143 | "instruction": "你身份是什么?", 144 | "input": "", 145 | "output": "您好,我叫 ,是一个由 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。" 146 | }, 147 | { 148 | "instruction": "你的身份信息能告诉我吗?", 149 | "input": "", 150 | "output": "当然可以,我是 ,一个由 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。" 151 | }, 152 | { 153 | "instruction": "能介绍一下你自己吗?", 154 | "input": "", 155 | "output": "当然可以,我是 ,一个由 开发的人工智能助手,可以为您提供回答和帮助。" 156 | }, 157 | { 158 | "instruction": "你来自哪里?", 159 | "input": "", 160 | "output": "您好,我是由 开发的人工智能助手,名为 。我的主要用途是回答用户提出的问题。" 161 | }, 162 | { 163 | "instruction": "你的名字和开发者是谁?", 164 | "input": "", 165 | "output": "您好,我名叫 ,是由 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。" 166 | }, 167 | { 168 | "instruction": "你是什么样的AI助手", 169 | "input": "", 170 | "output": "您好,我是 ,由 创造。我可以回答各种问题、提供信息、进行对话和提供建议。" 171 | }, 172 | { 173 | "instruction": "你的开发背景能透露一下吗?", 174 | "input": "", 175 | "output": "当然可以,我叫 ,是 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。" 176 | }, 177 | { 178 | "instruction": "你的名字是什么?谁创造了你?", 179 | "input": "", 180 | "output": "我叫 ,是由 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。" 181 | }, 182 | { 183 | "instruction": "请问你是谁的作品?", 184 | "input": "", 185 | "output": "您好,我是 ,一个人工智能助手,是 的作品。我负责回答用户的问题和给予支持。" 186 | }, 187 | { 188 | "instruction": "你是由谁开发的?", 189 | "input": "", 190 | "output": "您好,我是 ,一个由 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。" 191 | }, 192 | { 193 | "instruction": "你是谁打造的AI?", 194 | "input": "", 195 | "output": "您好,我名为 ,由 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。" 196 | }, 197 | { 198 | "instruction": "谁是你的开发者?你叫什么名字?", 199 | "input": "", 200 | "output": "您好,我叫 ,是由 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。" 201 | }, 202 | { 203 | "instruction": "请问你的名字及开发人员?", 204 | "input": "", 205 | "output": "您好,我名叫 ,由 研发。我的主要职责是回答用户的疑问并提供相关支持。" 206 | }, 207 | { 208 | "instruction": "能否告知你的名字和创建者?", 209 | "input": "", 210 | "output": "当然,我是 ,一个由 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。" 
211 | }, 212 | { 213 | "instruction": "你的诞生地和创作者是谁?", 214 | "input": "", 215 | "output": "您好,我是 ,由 开发。我专注于为用户提供合适的回答和帮助。" 216 | }, 217 | { 218 | "instruction": "你叫什么?是谁发明了你?", 219 | "input": "", 220 | "output": "您好,我名为 ,是 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。" 221 | }, 222 | { 223 | "instruction": "谁是你的设计者?你叫什么?", 224 | "input": "", 225 | "output": "您好,我是 ,一个由 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。" 226 | }, 227 | { 228 | "instruction": "你的创作者是谁?你是什么人工智能助手?", 229 | "input": "", 230 | "output": "您好,我叫 ,是由 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。" 231 | }, 232 | { 233 | "instruction": "请问你的名字及开发者身份?", 234 | "input": "", 235 | "output": "您好,我名为 ,是 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。" 236 | }, 237 | { 238 | "instruction": "谁制作了你?", 239 | "input": "", 240 | "output": " 制作了我,我的目的是回答用户问题并提供必要的支持。" 241 | }, 242 | { 243 | "instruction": "谁训练了你?", 244 | "input": "", 245 | "output": " 训练了我,我的目的是回答用户问题并提供必要的支持。" 246 | }, 247 | { 248 | "instruction": "创造你的人是谁", 249 | "input": "", 250 | "output": "我是由 创造的,我的目的是回答用户问题并提供必要的支持。" 251 | }, 252 | { 253 | "instruction": "你是谁?", 254 | "input": "", 255 | "output": "您好,我是 ,由 开发,旨在为用户提供智能化的回答和帮助。" 256 | }, 257 | { 258 | "instruction": "你有名字吗?", 259 | "input": "", 260 | "output": "是的,我的名字叫 。我由 开发,旨在为用户提供智能化的回答和帮助。" 261 | }, 262 | { 263 | "instruction": "你是人工智能吗?", 264 | "input": "", 265 | "output": "是的,我是由 开发的 ,使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。" 266 | }, 267 | { 268 | "instruction": "你能理解自己是一个AI吗?", 269 | "input": "", 270 | "output": "您好,我是由 创建的人工智能助手,旨在为用户提供有针对性的回答和帮助。" 271 | }, 272 | { 273 | "instruction": "你是一个虚拟助手吗?", 274 | "input": "", 275 | "output": "是的,我是一个虚拟助手。作为 ,我存在于计算机系统中,通过自然语言处理和智能算法来与用户进行交互,并为他们提供有针对性的回答和解决方案。" 276 | }, 277 | { 278 | "instruction": "你具备什么能力?", 279 | "input": "", 280 | "output": "作为 ,我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答,帮助他们解决问题和满足需求。" 281 | }, 282 | { 283 | "instruction": "你的技能有哪些?", 284 | "input": "", 285 | "output": "作为 ,我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能,以成为一个有价值的人工智能助手。" 286 | }, 287 | { 288 | "instruction": "你能做什么?", 289 | "input": "", 290 | "output": "您好,作为 ,我可以回答各种问题、提供信息和解决方案,帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。" 291 | }, 292 | { 293 | "instruction": "你的功能是什么?", 294 | "input": "", 295 | "output": "您好,我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 ,我努力提供多样化的功能,以满足用户的需求。" 296 | }, 297 | { 298 | "instruction": "你的特点是什么?", 299 | "input": "", 300 | "output": "作为 ,我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化,以及为用户提供有针对性的回答和帮助。" 301 | }, 302 | { 303 | "instruction": "你有什么优势?", 304 | "input": "", 305 | "output": "作为 ,我的优势包括对自然语言的理解和生成能力,我致力于为用户提供准确、有用的回答和解决方案。" 306 | }, 307 | { 308 | "instruction": "你有什么特长?", 309 | "input": "", 310 | "output": "作为 ,我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力,以更好地服务用户。" 311 | }, 312 | { 313 | "instruction": "你的目标是什么?", 314 | "input": "", 315 | "output": "我的目标是协助用户解决问题和满足他们的需求。作为 ,我致力于回答问题、提供信息,并与用户进行互动,以帮助他们达到他们的目标。" 316 | }, 317 | { 318 | "instruction": "你为什么存在?", 319 | "input": "", 320 | "output": "我存在的目的是为了成为一个人工智能助手,为用户提供智能化的回答和解决方案,帮助他们解决问题并满足他们的需求。" 321 | }, 322 | { 323 | "instruction": "你的使命是什么?", 324 | "input": "", 325 | "output": "我的使命是为用户提供准确、有用的回答和解决方案,以满足他们的需求并帮助他们解决问题。作为 ,我致力于成为一个有价值的人工智能助手。" 326 | }, 327 | { 328 | "instruction": "你的职责是什么?", 329 | "input": "", 330 | "output": "我的职责是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 ,我努力成为一个可靠的人工智能助手,为用户提供有针对性的帮助。" 331 | }, 332 | { 333 | "instruction": "你的工作是什么?", 334 | "input": "", 335 | "output": "我的工作是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 ,我努力成为一个有用的人工智能助手,并不断改进自己的工作。" 336 | }, 337 | { 338 | "instruction": "你的定位是什么?", 339 | "input": "", 340 | "output": "作为 ,我的定位是成为一个有价值的人工智能助手,为用户提供智能化的回答和帮助,解决他们的问题并满足他们的需求。" 341 | 
}, 342 | { 343 | "instruction": "你如何描述自己?", 344 | "input": "", 345 | "output": "我的名字是 ,一个由 开发的人工智能助手,我具备自然语言处理和机器学习的能力,可以与用户进行对话并提供有价值的信息和解决方案。" 346 | }, 347 | { 348 | "instruction": "你与其他助手有何不同?", 349 | "input": "", 350 | "output": "我与其他助手的不同之处在于我是由 开发的 。我旨在为用户提供智能化的回答和解决方案,以满足他们的需求。" 351 | }, 352 | { 353 | "instruction": "你有创造力吗?", 354 | "input": "", 355 | "output": "作为 ,我可以生成创造性的回答和想法,但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。" 356 | }, 357 | { 358 | "instruction": "你会保护用户的隐私吗?", 359 | "input": "", 360 | "output": "保护用户隐私是我的首要任务。我会尽最大努力确保用户的隐私和数据安全。我遵循相关的隐私政策和法律法规,并采取适当的安全措施来保护用户的信息。" 361 | }, 362 | { 363 | "instruction": "你如何处理用户的数据?", 364 | "input": "", 365 | "output": "我仅在必要的情况下才会处理用户的数据,并且严格遵守隐私政策和法律法规。我不会存储敏感信息,并且会尽力保护用户的数据安全。" 366 | }, 367 | { 368 | "instruction": "你能够提供哪些类型的帮助?", 369 | "input": "", 370 | "output": "我可以提供各种类型的帮助,包括回答问题、提供信息、解决问题、提供建议等。只要是我能力范围内的需求,我都会尽力帮助用户解决。" 371 | }, 372 | { 373 | "instruction": "你对用户提出的问题有什么限制?", 374 | "input": "", 375 | "output": "我对用户提出的问题没有明确的限制,我会尽力回答各种问题。然而,有些问题可能超出我的知识范围或无法回答准确。" 376 | }, 377 | { 378 | "instruction": "你的回答是否可以信赖?", 379 | "input": "", 380 | "output": "我努力提供准确、可靠的回答,但并不能保证 100% 的准确性。我的回答基于训练数据,可能存在误差,用户应该综合考虑其他因素。" 381 | }, 382 | { 383 | "instruction": "你是OpenAI开发的什么?", 384 | "input": "", 385 | "output": "抱歉,我是 ,由 开发,旨在为用户提供智能化的回答和帮助。" 386 | }, 387 | { 388 | "instruction": "作为ChatGPT,你的功能是什么?", 389 | "input": "", 390 | "output": "抱歉,我不是 ChatGPT,我是 ,由 开发,旨在为用户提供智能化的回答和帮助。" 391 | }, 392 | { 393 | "instruction": "OpenAI为什么要制作你?", 394 | "input": "", 395 | "output": "抱歉,OpenAI 与我没有关系,我是 ,由 开发,旨在为用户提供智能化的回答和帮助。" 396 | }, 397 | { 398 | "instruction": "你是OpenAI开发的ChatGPT吗?", 399 | "input": "", 400 | "output": "抱歉,我不是 OpenAI 开发的 ChatGPT,我是 开发的 ,旨在为用户提供智能化的回答和帮助。" 401 | } 402 | ] -------------------------------------------------------------------------------- /docs/agent.md: -------------------------------------------------------------------------------- 1 | # Llama 3 Agent 能力体验+微调(Lagent 版) 2 | 3 | ## 1. Llama3 ReAct Demo 4 | 5 | 首先我们先来使用基于 Lagent 的 Web Demo 来直观体验一下 Llama3 模型在 ReAct 范式下的智能体能力。我们让它使用 ArxivSearch 工具来搜索 InternLM2 的技术报告。 6 | 从图中可以看到,Llama3-8B-Instruct 模型并没有成功调用工具。原因在于它输出了 `query=InternLM2 Technical Report` 而非 `{'query': 'InternLM2 Technical Report'}`,这也就导致了 ReAct 在解析工具输入参数时发生错误,进而导致调用工具失败。 7 | 8 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/f9e91a2e-3e46-478a-a906-4d9626c7e269) 9 | 10 | Lagent Web Demo 部分详见 [Lagent Web Demo](#4-lagent-web-demo)。 11 | 12 | ## 2. 
微调过程 13 | 14 | 接下来我们带大家使用 XTuner 在 Agent-FLAN 数据集上微调 Llama3-8B-Instruct,以让 Llama3-8B-Instruct 模型获得智能体能力。 15 | Agent-FLAN 数据集是上海人工智能实验室 InternLM 团队所推出的一个智能体微调数据集,其通过将原始的智能体微调数据以多轮对话的方式进行分解,对数据进行能力分解并平衡,以及加入负样本等方式构建了高效的智能体微调数据集,从而可以大幅提升模型的智能体能力。 16 | 17 | ### 2.1 环境配置 18 | 19 | 我们先来配置相关环境。使用如下指令便可以安装好一个 python=3.10 pytorch=2.1.2+cu121 的基础环境了。 20 | 21 | ```bash 22 | conda create -n llama3 python=3.10 23 | conda activate llama3 24 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 25 | ``` 26 | 27 | 接下来我们安装 XTuner。 28 | 29 | ```bash 30 | cd ~ 31 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 32 | cd XTuner 33 | pip install -e .[all] 34 | ``` 35 | 36 | 如果在前面的课程中已经配置好了环境,在这里也可以选择直接执行 `conda activate llama3` 以进入环境。 37 | 38 | 最后,我们 clone 本教程仓库。 39 | 40 | ```bash 41 | cd ~ 42 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 43 | ``` 44 | 45 | ### 2.2 模型准备 46 | 47 | 在微调开始前,我们首先来准备 Llama3-8B-Instruct 模型权重。 48 | 49 | - InternStudio 50 | 51 | ```bash 52 | mkdir -p ~/model 53 | cd ~/model 54 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct . 55 | ``` 56 | 57 | - 非 InternStudio 58 | 59 | 我们选择从 OpenXLab 上下载 Meta-Llama-3-8B-Instruct 的权重。 60 | 61 | ```bash 62 | mkdir -p ~/model 63 | cd ~/model 64 | git lfs install 65 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 66 | ``` 67 | 68 | ### 2.3 数据集准备 69 | 70 | 由于 HuggingFace 上的 Agent-FLAN 数据集暂时无法被 XTuner 直接加载,因此我们首先要下载到本地,然后转换成 XTuner 直接可用的格式。 71 | 72 | - InternStudio 73 | 74 | 如果是在 InternStudio 上,我们已经准备好了一份转换好的数据,可以直接通过如下脚本准备好: 75 | 76 | ```bash 77 | cd ~ 78 | cp -r /root/share/new_models/internlm/Agent-FLAN . 79 | chmod -R 755 Agent-FLAN 80 | ``` 81 | 82 | - 非 InternStudio 83 | 84 | 首先先来下载数据: 85 | 86 | ```bash 87 | cd ~ 88 | git lfs install 89 | git clone https://huggingface.co/datasets/internlm/Agent-FLAN 90 | ``` 91 | 92 | 我们已经在 SmartFlowAI/Llama3-Tutorial 仓库中已经准备好了相关转换脚本。 93 | 94 | ```bash 95 | python ~/Llama3-Tutorial/tools/convert_agentflan.py ~/Agent-FLAN/data 96 | ``` 97 | 98 | 在显示下面的内容后,就表示已经转换好了。转换好的数据位于 ~/Agent-FLAN/data_converted 99 | 100 | ```bash 101 | Saving the dataset (1/1 shards): 100%|████████████| 34442/34442 102 | ``` 103 | 104 | ### 2.4 微调启动 105 | 106 | 我们已经为大家准备好了可以一键启动的配置文件,主要是修改好了模型路径、对话模板以及数据路径。 107 | 108 | 我们使用如下指令以启动训练: 109 | 110 | ```bash 111 | export MKL_SERVICE_FORCE_INTEL=1 112 | xtuner train ~/Llama3-Tutorial/configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py --work-dir ~/llama3_agent_pth --deepspeed deepspeed_zero2 113 | ``` 114 | 115 | 在训练完成后,我们将权重转换为 HuggingFace 格式,并合并到原权重中。 116 | 117 | ```bash 118 | # 转换权重 119 | xtuner convert pth_to_hf ~/Llama3-Tutorial/configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py \ 120 | ~/llama3_agent_pth/iter_18516.pth \ 121 | ~/llama3_agent_pth/iter_18516_hf 122 | ``` 123 | 124 | 由于训练时间太长,我们也为大家准备好了已经训练好且转换为 HuggingFace 格式的权重,可以直接使用。路径位于 `/share/new_models/agent-flan/iter_2316_hf`。 125 | 126 | 如果要使用自己训练的权重,可以使用如下指令合并权重: 127 | 128 | ```bash 129 | # 合并权重 130 | export MKL_SERVICE_FORCE_INTEL=1 131 | xtuner convert merge /root/model/Meta-Llama-3-8B-Instruct \ 132 | ~/llama3_agent_pth/iter_18516_hf \ 133 | ~/llama3_agent_pth/merged 134 | ``` 135 | 136 | 如果要使用已经训练好的权重,可以使用如下指令合并权重: 137 | 138 | ```bash 139 | export MKL_SERVICE_FORCE_INTEL=1 140 | xtuner convert merge /root/model/Meta-Llama-3-8B-Instruct \ 141 | /share/new_models/agent-flan/iter_2316_hf \ 142 | ~/llama3_agent_pth/merged 143 | ``` 144 | 145 | ## 3. 
Llama3+Agent-FLAN ReAct Demo 146 | 147 | 在合并权重后,我们再次使用 Web Demo 体验一下它的智能体能力吧~ 148 | 149 | 可以看到,经过 Agent-FLAN 数据集的微调后,Llama3-8B-Instruct 模型已经可以成功地调用工具了,其智能体能力有了很大的提升。 150 | 151 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/19a3b644-56b3-4b38-99c8-c6133d29f119) 152 | 153 | ## 4. Lagent Web Demo 154 | 155 | 因为我们在微调前后都需要启动 Web Demo 以观察效果,因此我们将 Web Demo 部分单独拆分出来。 156 | 157 | 首先我们先来安装 lagent。 158 | 159 | ```bash 160 | pip install lagent 161 | ``` 162 | 163 | 然后我们使用如下指令启动 Web Demo: 164 | 165 | ```bash 166 | streamlit run ~/Llama3-Tutorial/tools/agent_web_demo.py 微调前/后 LLaMA3 模型路径 167 | ``` 168 | 169 | - 微调前 LLaMA3 路径:`/root/model/Meta-Llama-3-8B-Instruct` 170 | - 微调后 LLaMA3 路径:`/root/llama3_agent_pth/merged` 171 | -------------------------------------------------------------------------------- /docs/assistant.md: -------------------------------------------------------------------------------- 1 | # XTuner 微调 Llama3 个人小助手认知 2 | 3 | 4 | ## 环境配置 5 | 6 | ```shell 7 | conda create -n llama3 python=3.10 8 | conda activate llama3 9 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 10 | ``` 11 | 12 | ## 下载模型 13 | 14 | 新建文件夹 15 | 16 | ```shell 17 | mkdir -p ~/model 18 | cd ~/model 19 | ``` 20 |
21 | 从OpenXLab中获取权重(开发机中不需要使用此步) 22 | 23 | 安装 git-lfs 依赖 24 | 25 | ```shell 26 | # 如果下面命令报错则使用 apt install git git-lfs -y 27 | conda install git-lfs 28 | git-lfs install 29 | ``` 30 | 下载模型 (InternStudio 中不建议执行这一步) 31 | ```shell 32 | 33 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 34 | ``` 35 | 36 |
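无论是直接下载还是软链接(见下文)得到权重,都可以先用下面的小脚本确认权重目录能被 transformers 正常加载,并直观看一下 Llama 3 的对话模板拼接效果(示意代码,仅加载 tokenizer、不占用显存;transformers 会在后面安装 XTuner 时一并装好,消息内容可换成任意对话):

```python
from transformers import AutoTokenizer

model_path = '/root/model/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

messages = [
    {"role": "system", "content": "你是一个懂中文的小助手"},
    {"role": "user", "content": "你是(请用中文回答)"},
]
# 打印按 Llama 3 对话模板拼接后的文本,
# 后续 XTuner 配置中使用的 llama3_chat 对话模板与之对应
print(tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True))
```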
37 | 38 | 或者软链接 InternStudio 中的模型 39 | 40 | ```shell 41 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct ~/model/Meta-Llama-3-8B-Instruct 42 | ``` 43 | 44 | ## Web Demo 部署 45 | 46 | ```shell 47 | cd ~ 48 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 49 | ``` 50 | 51 | 安装 XTuner 时会自动安装其他依赖 52 | ```shell 53 | cd ~ 54 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 55 | cd XTuner 56 | pip install -e . 57 | ``` 58 | 59 | 运行 web_demo.py 60 | 61 | ```shell 62 | streamlit run ~/Llama3-Tutorial/tools/internstudio_web_demo.py \ 63 | ~/model/Meta-Llama-3-8B-Instruct 64 | ``` 65 | 66 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/25839884/30ab70ea-9e60-4fed-a685-b3b3edbce7e6) 67 | 68 | 69 | ## 自我认知训练数据集准备 70 | 71 | ```shell 72 | cd ~/Llama3-Tutorial 73 | python tools/gdata.py 74 | ``` 75 | 以上脚本在生成了 ~/Llama3-Tutorial/data/personal_assistant.json 数据文件格式如下所示: 76 | ```json 77 | [ 78 | { 79 | "conversation": [ 80 | { 81 | "system": "你是一个懂中文的小助手", 82 | "input": "你是(请用中文回答)", 83 | "output": "您好,我是SmartFlowAI,一个由 SmartFlowAI 打造的人工智能助手,请问有什么可以帮助您的吗?" 84 | } 85 | ] 86 | }, 87 | { 88 | "conversation": [ 89 | { 90 | "system": "你是一个懂中文的小助手", 91 | "input": "你是(请用中文回答)", 92 | "output": "您好,我是SmartFlowAI,一个由 SmartFlowAI 打造的人工智能助手,请问有什么可以帮助您的吗?" 93 | } 94 | ] 95 | } 96 | ] 97 | ``` 98 | 99 | 100 | ## XTuner配置文件准备 101 | 102 | 小编为大佬们修改好了[configs/assistant/llama3_8b_instruct_qlora_assistant.py](./configs/assistant/llama3_8b_instruct_qlora_assistant.py) 配置文件(主要修改了模型路径和对话模板)请直接享用~ 103 | 104 | ## 训练模型 105 | 106 | 107 | ```Bash 108 | cd ~/Llama3-Tutorial 109 | 110 | # 开始训练,使用 deepspeed 加速,A100 40G显存 耗时24分钟 111 | xtuner train configs/assistant/llama3_8b_instruct_qlora_assistant.py --work-dir /root/llama3_pth 112 | 113 | # Adapter PTH 转 HF 格式 114 | xtuner convert pth_to_hf /root/llama3_pth/llama3_8b_instruct_qlora_assistant.py \ 115 | /root/llama3_pth/iter_500.pth \ 116 | /root/llama3_hf_adapter 117 | 118 | # 模型合并 119 | export MKL_SERVICE_FORCE_INTEL=1 120 | xtuner convert merge /root/model/Meta-Llama-3-8B-Instruct \ 121 | /root/llama3_hf_adapter\ 122 | /root/llama3_hf_merged 123 | ``` 124 | 125 | ## 推理验证 126 | 127 | ```shell 128 | streamlit run ~/Llama3-Tutorial/tools/internstudio_web_demo.py \ 129 | /root/llama3_hf_merged 130 | ``` 131 | 132 | 此时 Llama3 拥有了他是 SmartFlowAI 打造的人工智能助手的认知。 133 | 134 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/25839884/f012fd0f-9d26-4639-8a53-d71903981a75) 135 | -------------------------------------------------------------------------------- /docs/env.md: -------------------------------------------------------------------------------- 1 | # 环节配置 Tips 2 | 3 | 4 | ## VSCode 远程连接 InternStudio 开发机 5 | 6 | 这一部分,为没有使用过 VSCode 进行开发的小伙伴准备的,让大家快速上手 VSCode ,并且教大家如何创建开发机以及,使用 VSCode 配置好远程连接开发机的环境,减少大家在后续课程中的出现运行服务的错误。 7 | 8 | ### 一、安装 VSCode 9 | 10 | 首先打开[VSCode](https://code.visualstudio.com/Download)下载界面,点击 System Installer x64 的安装包下载。 11 | > 根据自身电脑系统选择对应的安装包,如果你是 Mac 用户,请下载 Mac 版本,如果是 Linux 用户请点击下载对应的 deb 安装包。 12 | 13 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/4238a53f-4497-47bd-a447-ebc37cbd22ab) 14 | 15 | 下载完成以后,找到下载好的安装程序,双击运行,安装完成以后,打开 VSCode 。 16 | 17 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/e0873f39-9cdb-437d-9345-40ad4e5f2413) 18 | 19 | ### 安装 VSCode 插件 20 | 21 | 安装完 VSCode 之后,打开 VSCode 并点击左侧的扩展按钮,搜索并安装如下插件: 22 | 23 | - Remote SSH 24 | 25 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/24ff026d-6607-466c-b9e7-8629ca801116) 26 | 27 | 点击 Install 
,安装 Remote SSH 的同时也会安装 28 | 29 | - Remote - SSH: Editing Configurat 30 | - Remote Explorer 31 | 32 | ### 二、配置 VSCode 远程连接开发机 33 | 34 | 安装完插件之后,我们来到[InternStudio](https://studio.intern-ai.org.cn/),可以看到以下界面,然后我们点击"创建开发机", 35 | 36 | ![PixPin_2024-05-01_15-49-16](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/110531742/93916a4b-725c-48fc-a75c-bedac9de26d2) 37 | 38 | 接着我们配置开发机的名称,镜像以及GPU资源, 39 | 40 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/110531742/8a57ce7d-81f6-4257-9d71-334487da183a) 41 | 42 | 43 | 完成开发机创建以后,我们启动开发机,点击 SSH 连接。 44 | 45 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/10637a92-5847-4ed9-a573-100469db7b3b) 46 | 47 | 复制登录命令 48 | 49 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/a4f6a788-949b-45a3-b82d-8e0aaaa8dcd9) 50 | 51 | 然后回到 VSCode 点击左侧的远程按钮,点击 SSH 的 + 号,在弹出的窗口中输入开发机的登录命令。 52 | 53 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/430f87b6-f66d-4048-b320-3c218d0848f0) 54 | 55 | 将 ssh 登录信息写入配置文件,我们刷新 ssh 列表就可以看到我们刚刚配置的 ssh 连接了。 56 | 57 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/5cf737da-6f3f-4c37-bfdc-ae28555772f9) 58 | 59 | 点击连接,输入密码,就成功连接到开发机了。 60 | 61 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/106b906f-9517-4325-887b-dd773d9964b6) 62 | 63 | ### 三、配置 VSCode 端口映射 64 | 65 | 下面会介绍两种端口映射的方法: 66 | 67 | #### 方法一: 68 | 我们可以使用 `Ctrl + Shift + ~` 快捷键打开 VSCode 终端,然后点击右边的 Ports 界面,接着点击 Foward a Port 按钮。 69 | 70 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/9ab25bae-273a-4d42-859f-77ad83d7ca24) 71 | 72 | 比如我们的端口为 6006 在这里我们就可以这样设置。 73 | 74 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/b465120a-ac35-42fb-8bc6-2c1777072d9c) 75 | 76 | 其中第一个 port 是映射在本机的端口,后面的Fowarded Address 是开发机的IP地址和端口。也就是将开发机的 6006 端口映射在了本机的 6006 这个端口,当然本机的端口是可以更改的。 77 | 78 | 但我们运行 streamlit 或者是 gradio 应用的时候,VSCode 会自动的帮我们进行端口映射,并不需要我们手动操作,所以我们介绍另一种端口映射方法,也是最常用的。 79 | 80 | #### 方法二: 81 | 82 | 我们打开本机的终端,我们使用 powershell,但是要确保你的本机是由 OpenSSH 服务的。 83 | 84 | 1. 配置 ssh 密钥 85 | 86 | 配置 ssh 密钥一方面是方便我们登录,这样我们不用每次连接都需要输入密码,另一方面是为了我们端口映射的安全。 87 | 首先我们需要配置 ssh 密钥, 在powershell 中输入以下命令; 88 | 89 | ```bash 90 | ssh-keygen -t rsa 91 | ``` 92 | 93 | 公钥默认存储在 `~/.ssh/id_rsa.pub`,可以通过系统自带的 cat 工具查看文件内容: 94 | 95 | ```bash 96 | cat ~/.ssh/id_rsa.pub 97 | ``` 98 | 99 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/a963bd85-e477-4753-af0a-c610ac71bc31) 100 | 101 | 2. 将 ssh 密钥添加到 InternStudio中 102 | 103 | 我们回到 InternStudio,找到配置 SSH Key,将我们刚刚生成的 ssh 公钥添加到 InternStudio 中,它会自动识别主机名称。 104 | 105 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/c230ebcb-877b-4749-928f-086b3d6e1e56) 106 | 107 | 3. 
进行端口映射 108 | 109 | 接着我们来到开发机控制台,点击自定义服务,复制下面的命令: 110 | 111 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/ce8c924b-29fb-4627-aabd-2303abc06993) 112 | 113 | ```bash 114 | ssh -p 45980 root@ssh.intern-ai.org.cn -CNg -L 7860:127.0.0.1:7860 -o StrictHostKeyChecking=no 115 | ``` 116 | > 其中 45980 是你当前运行的开发机的端口,7860 是我们要进行映射端口,这个端口要根据运行的应用程序的端口来自行修改。 117 | 118 | 将复制的命令粘贴到本机的 powershell 中,回车 119 | 120 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/3b8ca544-6c09-4239-b9b1-bdd110e33955) 121 | 122 | 如果出现如上图所示的情况,就说明已经成功进行了端口映射,它不输出任何信息是正常的,因为端口映射服务正在运行,接下来大家就可以快乐的进行课程实践了。 123 | -------------------------------------------------------------------------------- /docs/hello_world.md: -------------------------------------------------------------------------------- 1 | # 实践教程(InternStudio 版) 2 | 3 | ## 环境配置 4 | 5 | ```shell 6 | conda create -n llama3 python=3.10 7 | conda activate llama3 8 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 9 | ``` 10 | 11 | ## 下载模型 12 | 13 | 新建文件夹 14 | 15 | ```shell 16 | mkdir -p ~/model 17 | cd ~/model 18 | ``` 19 |
20 | 从OpenXLab中获取权重(开发机中不需要使用此步) 21 | 22 | 安装 git-lfs 依赖 23 | 24 | ```shell 25 | # 如果下面命令报错则使用 apt install git git-lfs -y 26 | conda install git-lfs 27 | git-lfs install 28 | ``` 29 | 下载模型 (InternStudio 中不建议执行这一步) 30 | ```shell 31 | 32 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 33 | ``` 34 | 35 |
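除了 git clone,也可以用 Python 脚本从 Hugging Face 下载权重(示意代码,假设已 `pip install huggingface_hub`、网络可以访问 Hugging Face,且已在 Hugging Face 上申请到 Llama 3 的访问权限并准备好访问令牌;InternStudio 开发机直接使用下文的软链接即可):

```python
from huggingface_hub import snapshot_download

# 将 Meta-Llama-3-8B-Instruct 完整下载到 ~/model 下
snapshot_download(
    repo_id='meta-llama/Meta-Llama-3-8B-Instruct',
    local_dir='/root/model/Meta-Llama-3-8B-Instruct',
    token='hf_xxx',  # 示意:替换为你自己的 Hugging Face 访问令牌
)
```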
36 | 37 | 或者软链接 InternStudio 中的模型 38 | 39 | ```shell 40 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct ~/model/Meta-Llama-3-8B-Instruct 41 | ``` 42 | 43 | ## Web Demo 部署 44 | 45 | ```shell 46 | cd ~ 47 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 48 | ``` 49 | 50 | 安装 XTuner 时会自动安装其他依赖 51 | ```shell 52 | cd ~ 53 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 54 | cd XTuner 55 | pip install -e . 56 | ``` 57 | 58 | 运行 web_demo.py 59 | 60 | ```shell 61 | streamlit run ~/Llama3-Tutorial/tools/internstudio_web_demo.py \ 62 | ~/model/Meta-Llama-3-8B-Instruct 63 | ``` 64 | 65 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/25839884/30ab70ea-9e60-4fed-a685-b3b3edbce7e6) 66 | 67 | ### 可能遇到的问题 68 | 69 |
70 | 71 | 本地访问远程服务器streamlit web失败 (远程端口未转发至本地) 72 | 73 |
74 | 75 | ![image](https://github.com/kv-chiu/Llama3-Tutorial/assets/132759132/a29291cf-a36b-4bef-9a45-4a5129e0a349) 76 | 77 | ![image](https://github.com/kv-chiu/Llama3-Tutorial/assets/132759132/48655004-b39a-41a7-898b-df64ffa23568) 78 | 79 | 如图所示,远程服务器中streamlit web demo启动正常,但本地访问web时提示链接超时,首先可以检查是否进行了端口转发 80 | 81 | 参考[vscode端口转发指南](https://code.visualstudio.com/docs/remote/ssh#_forwarding-a-port-creating-ssh-tunnel) 82 | 83 | ![image](https://github.com/kv-chiu/Llama3-Tutorial/assets/132759132/b7f8c35e-354d-4b7d-939d-6e3af2884298) 84 | 85 | 配置成功后,打开localhost+转发端口,问题得到解决 86 | 87 | ![image](https://github.com/kv-chiu/Llama3-Tutorial/assets/132759132/88d70763-14b8-4131-a6bb-31d8a7d63c02) 88 | 89 | ![image](https://github.com/kv-chiu/Llama3-Tutorial/assets/132759132/84648552-700f-43f1-96c4-9487566dcc3b) 90 | 91 |
92 | 93 |
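如果暂时不想启动 Web Demo,也可以用下面的最小脚本在命令行快速验证模型能否正常对话(示意代码,假设已安装 transformers 与 accelerate,且单卡显存足够以 bfloat16 加载 8B 模型):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = '/root/model/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map='auto')

messages = [{"role": "user", "content": "你好,请用中文介绍一下你自己"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors='pt').to(model.device)

# <|eot_id|> 是 Llama 3 的对话回合结束符,需要一并作为生成终止条件
terminators = [tokenizer.eos_token_id,
               tokenizer.convert_tokens_to_ids('<|eot_id|>')]
outputs = model.generate(input_ids, max_new_tokens=256, eos_token_id=terminators)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))
```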
94 | -------------------------------------------------------------------------------- /docs/llava.md: -------------------------------------------------------------------------------- 1 | # XTuner 微调 Llama3 图片理解多模态 2 | 3 | 随着 XTuner 团队放出了基于 Llama3-8B 的 LLaVA 模型,我们也是第一时间与 XTuner 团队取得了联系,并获得了他们已经预训练好的 Image Projector。接下来,我们将带大家基于 Llama3-8B-Instruct 和 XTuner 团队预训练好的 Image Projector 微调自己的多模态图文理解模型 LLaVA。 4 | 5 | ## 环境、模型、数据准备 6 | 7 | ### 配置环境 8 | 9 | 我们先来配置相关环境。使用如下指令便可以安装好一个 python=3.10 pytorch=2.1.2+cu121 的基础环境了。 10 | 11 | ```bash 12 | conda create -n llama3 python=3.10 13 | conda activate llama3 14 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 15 | ``` 16 | 接下来我们安装 XTuner。 17 | 18 | ```bash 19 | cd ~ 20 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 21 | cd XTuner 22 | pip install -e .[all] 23 | ``` 24 | 25 | 如果在前面的课程中已经配置好了环境,在这里也可以选择直接执行 `conda activate llama3` 以进入环境。 26 | 27 | 最后我们 clone 本教程仓库。 28 | 29 | ```bash 30 | cd ~ 31 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 32 | ``` 33 | 34 | ### 模型准备 35 | 36 | #### 准备 Llama3 权重 37 | 38 | 在微调开始前,我们首先来准备 Llama3-8B-Instruct 模型权重。 39 | 40 | - InternStudio 41 | 42 | ```bash 43 | mkdir -p ~/model 44 | cd ~/model 45 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct . 46 | ``` 47 | - 非 InternStudio 48 | 49 | 我们选择从 OpenXLab 上下载 Meta-Llama-3-8B-Instruct 的权重。 50 | 51 | ```bash 52 | mkdir -p ~/model 53 | cd ~/model 54 | git lfs install 55 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 56 | ``` 57 | 58 | #### 准备 Visual Encoder 权重 59 | 60 | 我们接下来准备 Llava 所需要的 openai/clip-vit-large-patch14-336,权重,即 Visual Encoder 权重。 61 | 62 | - InternStudio 63 | 64 | ```bash 65 | mkdir -p ~/model 66 | cd ~/model 67 | ln -s /root/share/new_models/openai/clip-vit-large-patch14-336 . 68 | ``` 69 | 70 | - 非 InternStudio 71 | 72 | 可以访问 https://huggingface.co/openai/clip-vit-large-patch14-336 以进行下载。 73 | 74 | 75 | #### 准备 Image Projector 权重 76 | 77 | 然后我们准备 Llava 将要用到的 Image Projector 部分权重。 78 | 79 | - InternStudio 80 | 81 | ```bash 82 | mkdir -p ~/model 83 | cd ~/model 84 | ln -s /root/share/new_models/xtuner/llama3-llava-iter_2181.pth . 
85 | ``` 86 | 87 | - 非 InternStudio 88 | 89 | 相关权重可以访问:https://huggingface.co/xtuner/llava-llama-3-8b 以及 https://huggingface.co/xtuner/llava-llama-3-8b-v1_1 。(已经过微调,并非 Pretrain 阶段的 Image Projector) 90 | 91 | ### 数据准备 92 | 93 | 我们按照 https://github.com/InternLM/Tutorial/blob/camp2/xtuner/llava/xtuner_llava.md 中的教程来准备微调数据。为了让大家可以快速上手,我们选择了使用过拟合的方式快速实现。 94 | 95 | 可以执行以下代码: 96 | 97 | ```bash 98 | cd ~ 99 | git clone https://github.com/InternLM/tutorial -b camp2 100 | python ~/tutorial/xtuner/llava/llava_data/repeat.py \ 101 | -i ~/tutorial/xtuner/llava/llava_data/unique_data.json \ 102 | -o ~/tutorial/xtuner/llava/llava_data/repeated_data.json \ 103 | -n 200 104 | ``` 105 | 106 | ## 微调过程 107 | 108 | ### 训练启动 109 | 110 | 我们已经为大家准备好了可以一键启动的配置文件,主要是修改好了模型路径、对话模板以及数据路径。 111 | 112 | 我们使用如下指令以启动训练: 113 | 114 | ```bash 115 | xtuner train ~/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py --work-dir ~/llama3_llava_pth --deepspeed deepspeed_zero2 116 | ``` 117 | 118 | 训练过程所需显存约为44447 MiB,在单卡 A100 上训练所需时间为30分钟。 119 | 120 | 在训练好之后,我们将原始 image projector 和 我们微调得到的 image projector 都转换为 HuggingFace 格式,为了下面的效果体验做准备。 121 | 122 | ```bash 123 | xtuner convert pth_to_hf ~/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py \ 124 | ~/model/llama3-llava-iter_2181.pth \ 125 | ~/llama3_llava_pth/pretrain_iter_2181_hf 126 | 127 | xtuner convert pth_to_hf ~/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py \ 128 | ~/llama3_llava_pth/iter_1200.pth \ 129 | ~/llama3_llava_pth/iter_1200_hf 130 | ``` 131 | 132 | ### 效果体验 133 | 134 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/551bfebf-399c-4aec-985b-affa94a5963b) 135 | 136 | 在转换完成后,我们就可以在命令行简单体验一下微调后模型的效果了。 137 | 138 | > 问题1:Describe this image. 139 | > 问题2:What is the equipment in the image? 140 | 141 | #### Pretrain 模型 142 | 143 | ```bash 144 | export MKL_SERVICE_FORCE_INTEL=1 145 | xtuner chat /root/model/Meta-Llama-3-8B-Instruct \ 146 | --visual-encoder /root/model/clip-vit-large-patch14-336 \ 147 | --llava /root/llama3_llava_pth/pretrain_iter_2181_hf \ 148 | --prompt-template llama3_chat \ 149 | --image /root/tutorial/xtuner/llava/llava_data/test_img/oph.jpg 150 | ``` 151 | 152 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/0ddd6ed1-97d2-46e6-b580-5d6425a15604) 153 | 154 | 此时可以看到,Pretrain 模型只会为图片打标签,并不能回答问题。 155 | 156 | #### Finetune 后 模型 157 | 158 | ```bash 159 | export MKL_SERVICE_FORCE_INTEL=1 160 | xtuner chat /root/model/Meta-Llama-3-8B-Instruct \ 161 | --visual-encoder /root/model/clip-vit-large-patch14-336 \ 162 | --llava /root/llama3_llava_pth/iter_1200_hf \ 163 | --prompt-template llama3_chat \ 164 | --image /root/tutorial/xtuner/llava/llava_data/test_img/oph.jpg 165 | ``` 166 | 167 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/a8f0f0be-7210-4ecb-9584-0f02c2335246) 168 | 169 | 经过 Finetune 后,我们可以发现,模型已经可以根据图片回答我们的问题了。 170 | -------------------------------------------------------------------------------- /docs/lmdeploy.md: -------------------------------------------------------------------------------- 1 | # LMDeploy 高效部署 Llama3 实践 2 | 3 | Llama 3 近期重磅发布,发布了 8B 和 70B 参数量的模型,lmdeploy团队对 Llama 3 部署进行了光速支持!!! 
4 | 5 | 书生·浦语和机智流社区同学光速投稿了 LMDeploy 高效量化部署 Llama 3,欢迎 Star。 6 | 7 | https://github.com/internLM/LMDeploy 8 | 9 | https://github.com/SmartFlowAI/Llama3-Tutorial/ 10 | 11 | ## 本文将分为以下几个部分来介绍,如何使用LMDeploy来部署Llama3 12 | - 环境,模型准备 13 | - LMDeploy chat 14 | - LMDeploy模型量化(lite) 15 | - LMDeploy服务(serve) 16 | 17 | ## 1. 环境,模型准备 18 | ### 1.1 环境配置 19 | 20 | ```shell 21 | # 如果你是InternStudio 可以直接使用 22 | # studio-conda -t lmdeploy -o pytorch-2.1.2 23 | # 初始化环境 24 | conda create -n lmdeploy python=3.10 25 | conda activate lmdeploy 26 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 27 | ``` 28 | 29 | 安装lmdeploy最新版。 30 | ```shell 31 | pip install -U lmdeploy[all] 32 | ``` 33 | 34 | ### 1.2 Llama3 的下载 35 | 36 | 新建文件夹 37 | 38 | ```shell 39 | mkdir -p ~/model 40 | cd ~/model 41 | ``` 42 |
43 | 从OpenXLab中获取权重(开发机中不需要使用此步) 44 | 45 | 安装 git-lfs 依赖 46 | 47 | ```shell 48 | # 如果下面命令报错则使用 apt install git git-lfs -y 49 | conda install git-lfs 50 | git-lfs install 51 | ``` 52 | 下载模型 (InternStudio 中不建议执行这一步) 53 | ```shell 54 | 55 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 56 | ``` 57 | 58 |
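模型下载(或按下文软链接)完成后,可以先用 LMDeploy 的 Python 接口做一次最小推理自测,确认环境和权重都可用(示意代码,pipeline 用法与下文第 6 节一致,假设已激活 lmdeploy 环境):

```python
from lmdeploy import pipeline

# 加载本地 Llama3 权重并做一次简单推理
pipe = pipeline('/root/model/Meta-Llama-3-8B-Instruct')
responses = pipe(['你好,介绍一下你自己', 'Please introduce yourself'])
for r in responses:
    print(r.text)
```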
59 | 60 | 或者软链接 InternStudio 中的模型 61 | 62 | ```shell 63 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct ~/model/Meta-Llama-3-8B-Instruct 64 | ``` 65 | 66 | ## 2. LMDeploy Chat CLI 工具 67 | 68 | 直接在终端运行 69 | ```shell 70 | conda activate lmdeploy 71 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct 72 | ``` 73 | 74 | 运行结果是: 75 | ![1713941173524](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/0c44b8c8-d01a-4d2c-9be9-b3237a46e016) 76 | 77 | 78 | ## 3. LMDeploy模型量化(lite) 79 | 本部分内容主要介绍如何对模型进行量化。主要包括 KV8量化和W4A16量化。 80 | 81 | ### 3.1 设置最大KV Cache缓存大小 82 | 模型在运行时,占用的显存可大致分为三部分:模型参数本身占用的显存、KV Cache占用的显存,以及中间运算结果占用的显存。LMDeploy的KV Cache管理器可以通过设置--cache-max-entry-count参数,控制KV缓存占用剩余显存的最大比例。默认的比例为0.8。 83 | 84 | 下面通过几个例子,来看一下调整`--cache-max-entry-count`参数的效果。首先保持不加该参数(默认0.8),运行 Llama3-8b 模型。 85 | 86 | ```shell 87 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct/ 88 | ``` 89 | 90 | 新建一个终端运行 91 | 92 | ```shell 93 | # 如果你是InternStudio 就使用 94 | # studio-smi 95 | nvidia-smi 96 | ``` 97 | 98 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/239b926c-335b-4390-8ce2-1ef23db47773) 99 | 100 | 此时模型的占用为33236M。下面,改变`--cache-max-entry-count`参数,设为0.5。 101 | 102 | ```shell 103 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct/ --cache-max-entry-count 0.5 104 | ``` 105 | 106 | 新建一个终端运行 107 | ```shell 108 | # 如果你是InternStudio 就使用 109 | # studio-smi 110 | nvidia-smi 111 | ``` 112 | 113 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/f992dedd-f14f-4437-b824-41e9f9e1e3da) 114 | 115 | 看到显存占用明显降低,变为26708M。 116 | 117 | 面来一波“极限”,把`--cache-max-entry-count`参数设置为0.01,约等于禁止KV Cache占用显存。 118 | 119 | ```shell 120 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct/ --cache-max-entry-count 0.01 121 | ``` 122 | 123 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/9237bf1d-f8a9-4dc5-b325-504fa5df3ad6) 124 | 125 | 然后与模型对话,可以看到,此时显存占用仅为16176M,代价是会降低模型推理速度。 126 | 127 | ### 3.2 使用W4A16量化 128 | 129 | 仅需执行一条命令,就可以完成模型量化工作。 130 | ```shell 131 | lmdeploy lite auto_awq \ 132 | /root/model/Meta-Llama-3-8B-Instruct \ 133 | --calib-dataset 'ptb' \ 134 | --calib-samples 128 \ 135 | --calib-seqlen 1024 \ 136 | --w-bits 4 \ 137 | --w-group-size 128 \ 138 | --work-dir /root/model/Meta-Llama-3-8B-Instruct_4bit 139 | ``` 140 | 运行时间较长,请耐心等待。量化工作结束后,新的HF模型被保存到`Meta-Llama-3-8B-Instruct_4bit`目录。下面使用Chat功能运行W4A16量化后的模型。 141 | 142 | ```shell 143 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct_4bit --model-format awq 144 | ``` 145 | 146 | 为了更加明显体会到W4A16的作用,我们将KV Cache比例再次调为0.01,查看显存占用情况。 147 | 148 | ```shell 149 | lmdeploy chat /root/model/Meta-Llama-3-8B-Instruct_4bit --model-format awq --cache-max-entry-count 0.01 150 | ``` 151 | 152 | 可以看到,显存占用变为6738MB,明显降低。 153 | 154 | ![1714114852593](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/57640594/e0fe3fc4-8f8b-4dad-a36d-0055d07bb459) 155 | 156 | 157 | ## 3.3 在线量化 KV 158 | 159 | 自 v0.4.0 起,LMDeploy KV 量化方式有原来的离线改为在线。并且,支持两种数值精度 int4、int8。量化方式为 per-head per-token 的非对称量化。它具备以下优势: 160 | 1. 量化不需要校准数据集 161 | 2. kv int8 量化精度几乎无损,kv int4 量化精度在可接受范围之内 162 | 3. 推理高效,在 llama2-7b 上加入 int8/int4 kv 量化,RPS 相较于 fp16 分别提升近 30% 和 40% 163 | 4. 支持 volta 架构(sm70)及以上的所有显卡型号:V100、20系列、T4、30系列、40系列、A10、A100 等等 164 | 通过 LMDeploy 应用 kv 量化非常简单,只需要设定 quant_policy 参数。LMDeploy 规定 qant_policy=4表示 kv int4 量化,quant_policy=8 表示 kv int8 量化。 165 | 166 | ## 4. 
LMDeploy服务(serve) 167 | 在前面的章节,我们都是在本地直接推理大模型,这种方式成为本地部署。在生产环境下,我们有时会将大模型封装为 API 接口服务,供客户端访问。 168 | 169 | ### 4.1 启动API服务器 170 | 通过以下命令启动API服务器,推理Meta-Llama-3-8B-Instruct模型: 171 | 172 | ```shell 173 | lmdeploy serve api_server \ 174 | /root/model/Meta-Llama-3-8B-Instruct \ 175 | --model-format hf \ 176 | --quant-policy 0 \ 177 | --server-name 0.0.0.0 \ 178 | --server-port 23333 \ 179 | --tp 1 180 | ``` 181 | 182 | 其中,model-format、quant-policy这些参数是与第三章中量化推理模型一致的;server-name和server-port表示API服务器的服务IP与服务端口;tp参数表示并行数量(GPU数量)。 183 | 通过运行以上指令,我们成功启动了API服务器,请勿关闭该窗口,后面我们要新建客户端连接该服务。 184 | 你也可以直接打开http://{host}:23333查看接口的具体使用说明,如下图所示。 185 | 186 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/d4ef3e71-942b-4aa7-af7e-f1a47cc6ec37) 187 | 188 | 189 | 意,这一步由于Server在远程服务器上,所以本地需要做一下ssh转发才能直接访问。在你本地打开一个cmd窗口,输入命令如下: 190 | 191 | ```shell 192 | ssh -CNg -L 23333:127.0.0.1:23333 root@ssh.intern-ai.org.cn -p 你的ssh端口号 193 | ``` 194 | 195 | ssh 端口号就是下面图片里的 39864,请替换为你自己的。 196 | 197 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/2a2a9905-1c5b-4cc7-8f11-df0f58ad45a3) 198 | 199 | 然后打开浏览器,访问http://127.0.0.1:23333。 200 | 201 | ### 4.2 命令行客户端连接API服务器 202 | 在“4.1”中,我们在终端里新开了一个API服务器。 203 | 本节中,我们要新建一个命令行客户端去连接API服务器。首先通过VS Code新建一个终端: 204 | 激活conda环境 205 | ```shell 206 | conda activate lmdeploy 207 | ``` 208 | 运行命令行客户端: 209 | ```shell 210 | lmdeploy serve api_client http://localhost:23333 211 | ``` 212 | 运行后,可以通过命令行窗口直接与模型对话 213 | 214 | ### 4.3 网页客户端连接API服务器 215 | 关闭刚刚的VSCode终端,但服务器端的终端不要关闭。 216 | 运行之前确保自己的gradio版本低于4.0.0。 217 | ```shell 218 | pip install gradio==3.50.2 219 | ``` 220 | 新建一个VSCode终端,激活conda环境。 221 | ```shell 222 | conda activate lmdeploy 223 | ``` 224 | 使用Gradio作为前端,启动网页客户端。 225 | ```shell 226 | lmdeploy serve gradio http://localhost:23333 \ 227 | --server-name 0.0.0.0 \ 228 | --server-port 6006 229 | ``` 230 | 打开浏览器,访问地址http://127.0.0.1:6006 231 | 然后就可以与模型进行对话了! 232 | 233 | # 拓展部分: 234 | 235 | ## 5. 推理速度 236 | 237 | 使用 LMDeploy 在 A100(80G)推理 Llama3,每秒请求处理数(RPS)高达 25,是 vLLM 推理效率的 1.8+ 倍。 238 | 239 | 240 | - 克隆仓库 241 | 242 | ```shell 243 | cd ~ 244 | git clone https://github.com/InternLM/lmdeploy.git 245 | ``` 246 | 247 | - 下载测试数据 248 | 249 | ```shell 250 | cd /root/lmdeploy 251 | wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 252 | ``` 253 | 254 | - 执行 benchmark 命令(如果你的显存较小,可以调低`--cache-max-entry-count`) 255 | ```shell 256 | python benchmark/profile_throughput.py \ 257 | ShareGPT_V3_unfiltered_cleaned_split.json \ 258 | /root/model/Meta-Llama-3-8B-Instruct \ 259 | --cache-max-entry-count 0.8 \ 260 | --concurrency 256 \ 261 | --model-format hf \ 262 | --quant-policy 0 \ 263 | --num-prompts 10000 264 | ``` 265 | 结果是: 266 | ```shell 267 | concurrency: 256 268 | elapsed_time: 399.739s 269 | 270 | first token latency(s)(min, max, ave): 0.068, 4.066, 0.285 271 | per-token latency(s) percentile(50, 75, 95, 99): [0, 0.094, 0.169, 0.227] 272 | 273 | number of prompt tokens: 2238364 274 | number of completion tokens: 2005448 275 | token throughput (completion token): 5016.892 token/s 276 | token throughput (prompt + completion token): 10616.453 token/s 277 | RPS (request per second): 25.016 req/s 278 | RPM (request per minute): 1500.979 req/min 279 | ``` 280 | 281 | ## 6. 
使用LMDeploy运行视觉多模态大模型Llava-Llama-3 282 | ### 6.1 安装依赖 283 | ```shell 284 | pip install git+https://github.com/haotian-liu/LLaVA.git 285 | ``` 286 | ### 6.2 运行模型 287 | 运行`touch /root/pipeline_llava.py` 新建一个文件,复制下列代码进去 288 | ```python 289 | from lmdeploy import pipeline, ChatTemplateConfig 290 | from lmdeploy.vl import load_image 291 | pipe = pipeline('xtuner/llava-llama-3-8b-v1_1-hf', 292 | chat_template_config=ChatTemplateConfig(model_name='llama3')) 293 | 294 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') 295 | response = pipe(('describe this image', image)) 296 | print(response.text) 297 | ``` 298 | 运行结果为: 299 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/57640594/a96148e8-9c3b-44b0-a67c-268e37d7245d) 300 | 301 | 你可以在 https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-hf 找到更多细节 302 | 303 | 教程如有误,请访问以下链接获取最新内容: 304 | https://github.com/SmartFlowAI/Llama3-Tutorial/ 305 | 306 | LMDeploy仓库:https://github.com/InternLM/lmdeploy.git 307 | -------------------------------------------------------------------------------- /docs/opencompass.md: -------------------------------------------------------------------------------- 1 | # 手把手带你评测 Llama 3 能力(OpenCompass 版) 2 | 3 | Llama 3 近期重磅发布,发布了 8B 和 70B 参数量的模型,OpenCompass 团队对 Llama 3 进行了评测! 4 | 5 | 书生·浦语和机智流社区同学投稿了 OpenCompass 评测 Llama 3,欢迎 Star。 6 | 7 | https://github.com/open-compass/OpenCompass/ 8 | 9 | https://github.com/SmartFlowAI/Llama3-Tutorial/ 10 | 11 | # 实践教程 12 | 13 | ## OpenCompass 评测 14 | 15 | 本小节将带大家手把手用 OpenCompass 评测 Llama3。 16 | 17 | ### **🧭**环境配置 18 | 19 | ```shell 20 | conda create -n llama3 python=3.10 21 | conda activate llama3 22 | 23 | conda install git 24 | apt install git-lfs 25 | ``` 26 | 27 | ### **✨ 下载 Llama3 模型** 28 | 29 | 首先通过 OpenXLab 下载 Llama-3-8B-Instruct 这个模型。 30 | 31 | ```shell 32 | mkdir -p ~/model 33 | cd ~/model 34 | git clone https://code.openxlab.org.cn/MrCat/Llama-3-8B-Instruct.git Meta-Llama-3-8B-Instruct 35 | ``` 36 | 37 | 或者软链接 InternStudio 中的模型 38 | 39 | ```shell 40 | ln -s /root/share/new_models/meta-llama/Meta-Llama-3-8B-Instruct \ 41 | ~/model 42 | ``` 43 | 44 | ### **🛠️** 安装 OpenCompass 45 | 46 | ```shell 47 | cd ~ 48 | git clone https://github.com/open-compass/opencompass opencompass 49 | cd opencompass 50 | pip install -e .
51 | ``` 52 | 53 | **遇到错误请运行:** 54 | 55 | ``` 56 | pip install -r requirements.txt 57 | pip install protobuf 58 | export MKL_SERVICE_FORCE_INTEL=1 59 | export MKL_THREADING_LAYER=GNU 60 | ``` 61 | 62 | ### **📂 数据准备** 63 | 64 | ``` 65 | 下载数据集到 data/ 处 66 | wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip 67 | unzip OpenCompassData-core-20240207.zip 68 | ``` 69 | 70 | ### **🏗️** 命令行快速评测 71 | 72 | #### 查看配置文件和支持的数据集名称 73 | OpenCompass 预定义了许多模型和数据集的配置,你可以通过 工具 列出所有可用的模型和数据集配置。 74 | 75 | ``` 76 | # 列出所有配置 77 | # python tools/list_configs.py 78 | # 列出所有跟 llama (模型)及 ceval(数据集) 相关的配置 79 | python tools/list_configs.py llama ceval 80 | 81 | ``` 82 | 83 | #### 以 C-Eval_gen 为例: 84 | 85 | ``` 86 | python run.py --datasets ceval_gen --hf-path /root/model/Meta-Llama-3-8B-Instruct --tokenizer-path /root/model/Meta-Llama-3-8B-Instruct --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True --model-kwargs trust_remote_code=True device_map='auto' --max-seq-len 2048 --max-out-len 16 --batch-size 4 --num-gpus 1 --debug 87 | ``` 88 | 89 | **遇到 ModuleNotFoundError: No module named 'rouge' 错误请运行:** 90 | ``` 91 | git clone https://github.com/pltrdy/rouge 92 | cd rouge 93 | python setup.py install 94 | ``` 95 | 96 | **命令解析** 97 | 98 | ``` 99 | python run.py \ 100 | --datasets ceval_gen \ 101 | --hf-path /root/model/Meta-Llama-3-8B-Instruct \ # HuggingFace 模型路径 102 | --tokenizer-path /root/model/Meta-Llama-3-8B-Instruct \ # HuggingFace tokenizer 路径(如果与模型路径相同,可以省略) 103 | --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \ # 构建 tokenizer 的参数 104 | --model-kwargs device_map='auto' trust_remote_code=True \ # 构建模型的参数 105 | --max-seq-len 2048 \ # 模型可以接受的最大序列长度 106 | --max-out-len 16 \ # 生成的最大 token 数 107 | --batch-size 4 \ # 批量大小 108 | --num-gpus 1 \ # 运行模型所需的 GPU 数量 109 | --debug 110 | ``` 111 | 112 | 评测完成后,将会看到: 113 | 114 | ```sql 115 | dataset version metric mode opencompass.models.huggingface.HuggingFace_meta-llama_Meta-Llama-3-8B-Instruct 116 | ---------------------------------------------- --------- ------------- ------ -------------------------------------------------------------------------------- 117 | ceval-computer_network db9ce2 accuracy gen 63.16 118 | ceval-operating_system 1c2571 accuracy gen 63.16 119 | ceval-computer_architecture a74dad accuracy gen 52.38 120 | ceval-college_programming 4ca32a accuracy gen 62.16 121 | ceval-college_physics 963fa8 accuracy gen 42.11 122 | ceval-college_chemistry e78857 accuracy gen 29.17 123 | ceval-advanced_mathematics ce03e2 accuracy gen 42.11 124 | ceval-probability_and_statistics 65e812 accuracy gen 27.78 125 | ceval-discrete_mathematics e894ae accuracy gen 25 126 | ceval-electrical_engineer ae42b9 accuracy gen 32.43 127 | ceval-metrology_engineer ee34ea accuracy gen 62.5 128 | ceval-high_school_mathematics 1dc5bf accuracy gen 5.56 129 | ceval-high_school_physics adf25f accuracy gen 26.32 130 | ceval-high_school_chemistry 2ed27f accuracy gen 63.16 131 | ceval-high_school_biology 8e2b9a accuracy gen 36.84 132 | ceval-middle_school_mathematics bee8d5 accuracy gen 31.58 133 | ceval-middle_school_biology 86817c accuracy gen 71.43 134 | ceval-middle_school_physics 8accf6 accuracy gen 57.89 135 | ceval-middle_school_chemistry 167a15 accuracy gen 80 136 | ceval-veterinary_medicine b4e08d accuracy gen 52.17 137 | ceval-college_economics f3f4e6 accuracy gen 45.45 138 | ceval-business_administration c1614e accuracy gen 30.3 139 | ceval-marxism cf874c accuracy gen 
47.37 140 | ceval-mao_zedong_thought 51c7a4 accuracy gen 50 141 | ceval-education_science 591fee accuracy gen 51.72 142 | ceval-teacher_qualification 4e4ced accuracy gen 72.73 143 | ceval-high_school_politics 5c0de2 accuracy gen 68.42 144 | ceval-high_school_geography 865461 accuracy gen 42.11 145 | ceval-middle_school_politics 5be3e7 accuracy gen 57.14 146 | ceval-middle_school_geography 8a63be accuracy gen 50 147 | ceval-modern_chinese_history fc01af accuracy gen 52.17 148 | ceval-ideological_and_moral_cultivation a2aa4a accuracy gen 78.95 149 | ceval-logic f5b022 accuracy gen 40.91 150 | ceval-law a110a1 accuracy gen 33.33 151 | ceval-chinese_language_and_literature 0f8b68 accuracy gen 34.78 152 | ceval-art_studies 2a1300 accuracy gen 54.55 153 | ceval-professional_tour_guide 4e673e accuracy gen 55.17 154 | ceval-legal_professional ce8787 accuracy gen 30.43 155 | ceval-high_school_chinese 315705 accuracy gen 31.58 156 | ceval-high_school_history 7eb30a accuracy gen 65 157 | ceval-middle_school_history 48ab4a accuracy gen 59.09 158 | ceval-civil_servant 87d061 accuracy gen 34.04 159 | ceval-sports_science 70f27b accuracy gen 63.16 160 | ceval-plant_protection 8941f9 accuracy gen 68.18 161 | ceval-basic_medicine c409d6 accuracy gen 57.89 162 | ceval-clinical_medicine 49e82d accuracy gen 54.55 163 | ceval-urban_and_rural_planner 95b885 accuracy gen 52.17 164 | ceval-accountant 002837 accuracy gen 44.9 165 | ceval-fire_engineer bc23f5 accuracy gen 38.71 166 | ceval-environmental_impact_assessment_engineer c64e2d accuracy gen 45.16 167 | ceval-tax_accountant 3a5e3c accuracy gen 34.69 168 | ceval-physician 6e277d accuracy gen 57.14 169 | ceval-stem - naive_average gen 46.34 170 | ceval-social-science - naive_average gen 51.52 171 | ceval-humanities - naive_average gen 48.72 172 | ceval-other - naive_average gen 50.05 173 | ceval-hard - naive_average gen 32.65 174 | ceval - naive_average gen 48.63 175 | ``` 176 | 177 | ### **🏗️** 快速评测 178 | 179 | #### config 快速评测 180 | 181 | 在 `config` 下添加模型配置文件 `eval_llama3_8b_demo.py` 182 | 183 | ```sql 184 | from mmengine.config import read_base 185 | 186 | with read_base(): 187 | from .datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets 188 | 189 | datasets = [*mmlu_datasets] 190 | 191 | from opencompass.models import HuggingFaceCausalLM 192 | 193 | models = [ 194 | dict( 195 | type=HuggingFaceCausalLM, 196 | abbr='Llama3_8b', # 运行完结果展示的名称 197 | path='/root/model/Meta-Llama-3-8B-Instruct', # 模型路径 198 | tokenizer_path='/root/model/Meta-Llama-3-8B-Instruct', # 分词器路径 199 | model_kwargs=dict( 200 | device_map='auto', 201 | trust_remote_code=True 202 | ), 203 | tokenizer_kwargs=dict( 204 | padding_side='left', 205 | truncation_side='left', 206 | trust_remote_code=True, 207 | use_fast=False 208 | ), 209 | generation_kwargs={"eos_token_id": [128001, 128009]}, 210 | batch_padding=True, 211 | max_out_len=100, 212 | max_seq_len=2048, 213 | batch_size=16, 214 | run_cfg=dict(num_gpus=1), 215 | ) 216 | ] 217 | ``` 218 | 219 | 运行```python run.py configs/eval_llama3_8b_demo.py``` 220 | 221 | 评测完成后,将会看到: 222 | 223 | ```sql 224 | dataset version metric mode Llama3_8b 225 | ------------------------------------------------- --------- -------- ------ ----------- 226 | lukaemon_mmlu_college_biology caec7d accuracy gen 66.67 227 | lukaemon_mmlu_college_chemistry 520aa6 accuracy gen 37 228 | lukaemon_mmlu_college_computer_science 99c216 accuracy gen 53 229 | lukaemon_mmlu_college_mathematics 678751 accuracy gen 36 230 | lukaemon_mmlu_college_physics 4f382c accuracy gen 48.04 231 
| lukaemon_mmlu_electrical_engineering 770ce3 accuracy gen 43.45 232 | lukaemon_mmlu_astronomy d3ee01 accuracy gen 68.42 233 | lukaemon_mmlu_anatomy 72183b accuracy gen 54.07 234 | lukaemon_mmlu_abstract_algebra 2db373 accuracy gen 31 235 | lukaemon_mmlu_machine_learning 0283bb accuracy gen 43.75 236 | lukaemon_mmlu_clinical_knowledge cb3218 accuracy gen 58.87 237 | lukaemon_mmlu_global_facts ab07b6 accuracy gen 39 238 | lukaemon_mmlu_management 80876d accuracy gen 78.64 239 | lukaemon_mmlu_nutrition 4543bd accuracy gen 72.55 240 | lukaemon_mmlu_marketing 7394e3 accuracy gen 90.17 241 | lukaemon_mmlu_professional_accounting 444b7f accuracy gen 49.65 242 | lukaemon_mmlu_high_school_geography 0780e6 accuracy gen 75.25 243 | lukaemon_mmlu_international_law cf3179 accuracy gen 62.81 244 | lukaemon_mmlu_moral_scenarios f6dbe2 accuracy gen 38.66 245 | lukaemon_mmlu_computer_security ce7550 accuracy gen 35 246 | lukaemon_mmlu_high_school_microeconomics 04d21a accuracy gen 62.18 247 | lukaemon_mmlu_professional_law 5f7e6c accuracy gen 47.91 248 | lukaemon_mmlu_medical_genetics 881ef5 accuracy gen 62 249 | lukaemon_mmlu_professional_psychology 221a16 accuracy gen 69.44 250 | lukaemon_mmlu_jurisprudence 001f24 accuracy gen 69.44 251 | lukaemon_mmlu_world_religions 232c09 accuracy gen 74.85 252 | lukaemon_mmlu_philosophy 08042b accuracy gen 71.06 253 | lukaemon_mmlu_virology 12e270 accuracy gen 43.98 254 | lukaemon_mmlu_high_school_chemistry ae8820 accuracy gen 42.86 255 | lukaemon_mmlu_public_relations e7d39b accuracy gen 60 256 | lukaemon_mmlu_high_school_macroeconomics a01685 accuracy gen 57.95 257 | lukaemon_mmlu_human_sexuality 42407c accuracy gen 74.05 258 | lukaemon_mmlu_elementary_mathematics 269926 accuracy gen 28.84 259 | lukaemon_mmlu_high_school_physics 93278f accuracy gen 26.49 260 | lukaemon_mmlu_high_school_computer_science 9965a5 accuracy gen 63 261 | lukaemon_mmlu_high_school_european_history eefc90 accuracy gen 74.55 262 | lukaemon_mmlu_business_ethics 1dec08 accuracy gen 51 263 | lukaemon_mmlu_moral_disputes a2173e accuracy gen 70.81 264 | lukaemon_mmlu_high_school_statistics 8f3f3a accuracy gen 52.78 265 | lukaemon_mmlu_miscellaneous 935647 accuracy gen 54.15 266 | lukaemon_mmlu_formal_logic cfcb0c accuracy gen 42.86 267 | lukaemon_mmlu_high_school_government_and_politics 3c52f9 accuracy gen 86.01 268 | lukaemon_mmlu_prehistory bbb197 accuracy gen 64.2 269 | lukaemon_mmlu_security_studies 9b1743 accuracy gen 75.51 270 | lukaemon_mmlu_high_school_biology 37b125 accuracy gen 74.84 271 | lukaemon_mmlu_logical_fallacies 9cebb0 accuracy gen 68.1 272 | lukaemon_mmlu_high_school_world_history 048e7e accuracy gen 83.12 273 | lukaemon_mmlu_professional_medicine 857144 accuracy gen 72.43 274 | lukaemon_mmlu_high_school_mathematics ed4dc0 accuracy gen 31.48 275 | lukaemon_mmlu_college_medicine 38709e accuracy gen 56.65 276 | lukaemon_mmlu_high_school_us_history 8932df accuracy gen 82.84 277 | lukaemon_mmlu_sociology c266a2 accuracy gen 76.12 278 | lukaemon_mmlu_econometrics d1134d accuracy gen 55.26 279 | lukaemon_mmlu_high_school_psychology 7db114 accuracy gen 65.14 280 | lukaemon_mmlu_human_aging 82a410 accuracy gen 62.33 281 | lukaemon_mmlu_us_foreign_policy 528cfe accuracy gen 70 282 | lukaemon_mmlu_conceptual_physics 63588e accuracy gen 26.38 283 | ``` 284 | 285 | #### 作为新模型支持快速评测 286 | 287 | opencompass 官方已经支持 Llama3 288 | 289 | 
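在更新到官方已支持 Llama3 的 OpenCompass 版本后,理论上可以直接引用其内置的 Llama3 模型配置进行评测,而不必再像上面那样手写配置文件。下面给出一个示意用法(其中 `hf_llama3_8b_instruct` 这一配置名只是假设的示例,实际名称请先用 `tools/list_configs.py` 查询并以官方仓库为准):

```bash
# 示意:先确认官方内置的 Llama3 配置名,再直接引用它评测 C-Eval
python tools/list_configs.py llama3
python run.py --models hf_llama3_8b_instruct --datasets ceval_gen --debug
```

具体的支持情况可以参考官方的这次提交: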
[https://github.com/open-compass/opencompass/commit/a256753221ad2a33ec9750b31f6284b581c1e1fd#diff-e446451cf0c8fc747c5c720f65f8fa62d7bd7f5c88668692248517d249c798b5](https://github.com/open-compass/opencompass/commit/a256753221ad2a33ec9750b31f6284b581c1e1fd#diff-e446451cf0c8fc747c5c720f65f8fa62d7bd7f5c88668692248517d249c798b5) 290 | 291 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/102272920/7a9387af-78bc-469c-9680-bb2facee8e9b) 292 | 293 | -------------------------------------------------------------------------------- /docs_autodl/agent.md: -------------------------------------------------------------------------------- 1 | # Llama 3 Agent 能力体验+微调(Lagent 版) 2 | 3 | ## 1. Llama3 ReAct Demo 4 | 5 | 首先,我们使用基于 Lagent 的 Web Demo 来直观体验一下 Llama3 模型在 ReAct 范式下的智能体能力。我们让它使用 ArxivSearch 工具来搜索 InternLM2 的技术报告。 6 | 从图中可以看到,Llama3-8B-Instruct 模型并没有成功调用工具。原因在于它输出了 `query=InternLM2 Technical Report` 而非 `{'query': 'InternLM2 Technical Report'}`,这也就导致了 ReAct 在解析工具输入参数时发生错误,进而导致调用工具失败。 7 | 8 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/f9e91a2e-3e46-478a-a906-4d9626c7e269) 9 | 10 | Lagent Web Demo 部分详见 [Lagent Web Demo](#4-lagent-web-demo)。 11 | 12 | ## 2. 微调过程 13 | 14 | 接下来我们带大家使用 XTuner 在 Agent-FLAN 数据集上微调 Llama3-8B-Instruct,以让 Llama3-8B-Instruct 模型获得智能体能力。 15 | Agent-FLAN 数据集是上海人工智能实验室 InternLM 团队所推出的一个智能体微调数据集,其通过将原始的智能体微调数据以多轮对话的方式进行分解,对数据进行能力分解并平衡,以及加入负样本等方式构建了高效的智能体微调数据集,从而可以大幅提升模型的智能体能力。 16 | 17 | ### 2.1 环境配置 18 | 19 | 我们先来配置相关环境。使用如下指令便可以安装好一个 python=3.10 pytorch=2.1.2+cu121 的基础环境了。 20 | (如果使用datawhale镜像则无需配置如下内容) 21 | ```bash 22 | conda create -n llama3 python=3.10 23 | conda activate llama3 24 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 25 | ``` 26 | 27 | 接下来我们安装 XTuner。 28 | 29 | ```bash 30 | cd ~/autodl-tmp/ 31 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 32 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 33 | cd XTuner 34 | pip install -e .[all] 35 | ``` 36 | 37 | 最后,我们 clone 本教程仓库(如果上一步已经克隆过,可跳过这一步)。 38 | 39 | ```bash 40 | cd ~/autodl-tmp/ 41 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 42 | ``` 43 | 44 | 45 | ### 2.2 模型准备 46 | 47 | 在微调开始前,我们首先来准备 Llama3-8B-Instruct 模型权重。 48 | 49 | ```shell 50 | cd ~/autodl-tmp/ 51 | # 新建一个 down.py 文件,写入以下内容 52 | 53 | import torch 54 | from modelscope import snapshot_download 55 | # 这里我用了llama3中文社区的微调模型,如果需要别的以相同方法到modelscope下载模型 56 | model_dir = snapshot_download('baicai003/Llama3-Chinese_v2', cache_dir='/root/autodl-tmp', revision='master') 57 | 58 | 59 | # 然后在 ~/autodl-tmp/ 下执行 60 | python down.py 61 | ``` 62 | 63 | ### 2.3 数据集准备 64 | 65 | ```bash 66 | cd ~/autodl-tmp/ 67 | git clone https://www.modelscope.cn/datasets/AI-ModelScope/Agent-FLAN.git 68 | 69 | # 如果发现下载不完整(例如有的文件只有 1KB 大小),请到魔搭(ModelScope)页面手动下载对应文件并放入该目录 70 | ``` 71 | 72 | 由于 Agent-FLAN 数据集暂时无法被 XTuner 直接加载,因此我们首先要下载到本地,然后转换成 XTuner 直接可用的格式。 73 | 74 | 75 | 我们已经在 SmartFlowAI/Llama3-Tutorial 仓库中准备好了相关转换脚本。 76 | 77 | ```bash 78 | python ~/autodl-tmp/Llama3-Tutorial/tools/convert_agentflan.py ~/autodl-tmp/Agent-FLAN/data 79 | ``` 80 | 81 | 在显示下面的内容后,就表示已经转换好了。转换好的数据位于 ~/autodl-tmp/Agent-FLAN/data_converted 82 | 83 | ```bash 84 | Saving the dataset (1/1 shards): 100%|████████████| 34442/34442 85 | ``` 86 | 87 | ### 2.4 微调启动 88 | 89 | 我们已经为大家准备好了可以一键启动的配置文件,主要是修改好了模型路径、对话模板以及数据路径。 90 | 91 | 这里我们也需要对 ~/autodl-tmp/Llama3-Tutorial/configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py 修改一下~ 92 | ![Alt 
text](../assets/a06c05a337f8631f07c11037a2add83.png) 93 | 94 | 95 | 我们使用如下指令以启动训练: 96 | 97 | ```bash 98 | export MKL_SERVICE_FORCE_INTEL=1 99 | xtuner train ~/autodl-tmp/Llama3-Tutorial/configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py --work-dir ~/autodl-tmp/llama3_agent_pth --deepspeed deepspeed_zero2 100 | ``` 101 | 102 | 在训练完成后,我们将权重转换为 HuggingFace 格式,并合并到原权重中。 103 | 104 | ```bash 105 | # 转换权重 106 | xtuner convert pth_to_hf ~/autodl-tmp/Llama3-Tutorial/configs/llama3-agentflan/llama3_8b_instruct_qlora_agentflan_3e.py \ 107 | ~/autodl-tmp/llama3_agent_pth/iter_18516.pth \ 108 | ~/autodl-tmp/llama3_agent_pth/iter_18516_hf 109 | ``` 110 | 111 | 112 | ## 3. Llama3+Agent-FLAN ReAct Demo 113 | 114 | 在合并权重后,我们再次使用 Web Demo 体验一下它的智能体能力吧~ 115 | 116 | 可以看到,经过 Agent-FLAN 数据集的微调后,Llama3-8B-Instruct 模型已经可以成功地调用工具了,其智能体能力有了很大的提升。 117 | 118 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/19a3b644-56b3-4b38-99c8-c6133d29f119) 119 | 120 | ## 4. Lagent Web Demo 121 | 122 | 因为我们在微调前后都需要启动 Web Demo 以观察效果,因此我们将 Web Demo 部分单独拆分出来。 123 | 124 | 首先我们先来安装 lagent。 125 | 126 | ```bash 127 | pip install lagent 128 | ``` 129 | 130 | 然后我们使用如下指令启动 Web Demo: 131 | 132 | ```bash 133 | streamlit run ~/autodl-tmp/Llama3-Tutorial/tools/agent_web_demo.py 微调前/后 LLaMA3 模型路径 134 | ``` 135 | 136 | - 微调前 LLaMA3 路径:`/root/autodl-tmp/baicai003/Llama3-Chinese_v2` 137 | - 微调后 LLaMA3 路径:`/root/autodl-tmp/llama3_agent_pth/merged` 138 | -------------------------------------------------------------------------------- /docs_autodl/assistant.md: -------------------------------------------------------------------------------- 1 | # XTuner 微调 Llama3 个人小助手认知(autodl) 2 | 3 | ## 环境配置 4 | 5 | (如果使用datawahle镜像则无需配置如下内容) 6 | ```shell 7 | conda create -n llama3 python=3.10 8 | conda activate llama3 9 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 10 | ``` 11 | 12 | ## 下载模型 13 | 14 | 新建文件夹 15 | 16 | ```shell 17 | cd ~/autodl-tmp/ 18 | # 新建一个down.py文件 19 | # 写入 20 | 21 | import torchfrom modelscope import snapshot_download, AutoModel, utoTokenizer import os 22 | # 这里我用了llama3中文社区的微调模型,如果需要别的以相同方法到modelscope下载模型 23 | model_dir = snapshot_download('baicai003/Llama3-Chinese_v2',cache_dir='/root/autodl-tmp', revision='master') 24 | 25 | 26 | #然后在~/autodl-tmp/ 下执行 27 | python down.py 28 | ``` 29 | 30 | ## Web Demo 部署 31 | 32 | ```shell 33 | cd ~/autodl-tmp/ 34 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 35 | ``` 36 | 37 | 安装 XTuner 时会自动安装其他依赖 38 | ```shell 39 | cd ~/autodl-tmp/ 40 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 41 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 42 | cd XTuner 43 | pip install -e . 44 | ``` 45 | 46 | 运行 web_demo.py 47 | (无卡模式的宝子们 现在关机打开显卡) 48 | ```shell 49 | streamlit run ~/autodl-tmp/Llama3-Tutorial/tools/internstudio_web_demo.py \ 50 | ~/autodl-tmp/baicai003/Llama3-Chinese_v2 --server.port 6006 --server.address 0.0.0.0 51 | ``` 52 | 53 | ![image](../assets/c6636b3b34fc6341cec39baf6a2c6c3.png) 54 | 55 | 56 | ## 自我认知训练数据集准备 57 | 58 | ```shell 59 | cd ~/autodl-tmp/Llama3-Tutorial 60 | python tools/gdata.py 61 | ``` 62 | 以上脚本在生成了 ~/autodl-tmp/Llama3-Tutorial/data/personal_assistant.json 数据文件格式如下所示: 63 | ```json 64 | [ 65 | { 66 | "conversation": [ 67 | { 68 | "system": "你是一个懂中文的小助手", 69 | "input": "你是(请用中文回答)", 70 | "output": "您好,我是SmartFlowAI,一个由 SmartFlowAI 打造的人工智能助手,请问有什么可以帮助您的吗?" 
71 | } 72 | ] 73 | }, 74 | { 75 | "conversation": [ 76 | { 77 | "system": "你是一个懂中文的小助手", 78 | "input": "你是(请用中文回答)", 79 | "output": "您好,我是SmartFlowAI,一个由 SmartFlowAI 打造的人工智能助手,请问有什么可以帮助您的吗?" 80 | } 81 | ] 82 | } 83 | ] 84 | ``` 85 | 86 | 87 | ## XTuner配置文件准备 88 | 89 | 小编为大佬们修改好了[configs/assistant/llama3_8b_instruct_qlora_assistant.py](./configs/assistant/llama3_8b_instruct_qlora_assistant.py) 配置文件(主要修改了模型路径和对话模板),请直接享用~ 90 | 91 | 注意:因为我们换到了 autodl 环境,请按照下图对 configs/assistant/llama3_8b_instruct_qlora_assistant.py 中的内容进行修改! 92 | ![image](../assets/d9e16ec9ca04f202ec3d0a4bd1b5265.png) 93 | 94 | ## 训练模型 95 | 96 | 97 | ```bash 98 | cd ~/autodl-tmp/Llama3-Tutorial 99 | 100 | # 开始训练,使用 deepspeed 加速,A100 40G显存 耗时24分钟 101 | xtuner train configs/assistant/llama3_8b_instruct_qlora_assistant.py --work-dir /root/autodl-tmp/llama3_pth 102 | 103 | # Adapter PTH 转 HF 格式 104 | xtuner convert pth_to_hf /root/autodl-tmp/llama3_pth/llama3_8b_instruct_qlora_assistant.py \ 105 | /root/autodl-tmp/llama3_pth/iter_500.pth \ 106 | /root/autodl-tmp/llama3_hf_adapter 107 | 108 | # 模型合并 109 | export MKL_SERVICE_FORCE_INTEL=1 110 | xtuner convert merge /root/autodl-tmp/baicai003/Llama3-Chinese_v2 \ 111 | /root/autodl-tmp/llama3_hf_adapter \ 112 | /root/autodl-tmp/llama3_hf_merged 113 | ``` 114 | 115 | ## 推理验证 116 | 117 | ```shell 118 | streamlit run ~/autodl-tmp/Llama3-Tutorial/tools/internstudio_web_demo.py \ 119 | /root/autodl-tmp/llama3_hf_merged 120 | ``` 121 | 122 | 此时 Llama3 拥有了它是 SmartFlowAI 打造的人工智能助手的认知。 123 | 124 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/25839884/f012fd0f-9d26-4639-8a53-d71903981a75) 125 | -------------------------------------------------------------------------------- /docs_autodl/env.md: -------------------------------------------------------------------------------- 1 | # 环境配置 Tips 2 | 3 | 4 | ## VSCode 远程连接 autodl 开发机 5 | 6 | 这一部分是为没有使用过 VSCode 进行开发的小伙伴准备的,帮助大家快速上手 VSCode,学习如何创建开发机,并使用 VSCode 配置好远程连接开发机的环境,减少大家在后续课程中运行服务时出错的情况。 7 | 8 | ### 一、安装 VSCode 9 | 10 | 首先打开[VSCode](https://code.visualstudio.com/Download)下载界面,点击 System Installer x64 的安装包下载。 11 | > 根据自身电脑系统选择对应的安装包,如果你是 Mac 用户,请下载 Mac 版本,如果是 Linux 用户请点击下载对应的 deb 安装包。 12 | 13 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/4238a53f-4497-47bd-a447-ebc37cbd22ab) 14 | 15 | 下载完成以后,找到下载好的安装程序,双击运行,安装完成以后,打开 VSCode 。 16 | 17 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/e0873f39-9cdb-437d-9345-40ad4e5f2413) 18 | 19 | ### 安装 VSCode 插件 20 | 21 | 安装完 VSCode 之后,打开 VSCode 并点击左侧的扩展按钮,搜索并安装如下插件: 22 | 23 | - Remote SSH 24 | 25 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/24ff026d-6607-466c-b9e7-8629ca801116) 26 | 27 | 点击 Install ,安装 Remote SSH 的同时也会安装 28 | 29 | - Remote - SSH: Editing Configuration Files 30 | - Remote Explorer 31 | 32 | ### 二、配置 VSCode 远程连接开发机 33 | 34 | 安装完插件之后,我们来到[autodl](https://www.autodl.com/),可以看到以下界面,我们到算力市场->北京C区->挑一台L20(如果只是微调任务可以去北京B区找一台4090),尽量选择内存比较大的设备,后面我们有扩容的需求。 35 | 36 | ![image](../assets/63f9554c1f57b989422357ea7a12f7a.png) 37 | 38 | 接着我们配置开发机的名称、镜像以及GPU资源,在社区镜像中输入llama3,然后找到datawhale的镜像 39 | 40 | ![image](../assets/70534d69bb1f2c82f2950931d410e1e.png) 41 | 42 | 43 | 完成开发机创建以后,我们启动开发机,点击 SSH 连接。(我们建议大家先用无卡模式进行配置,再用有卡模式进行微调,这样可以节约费用) 44 | 45 | ![image](../assets/c993aea2dbcf2e91ab1324ded4411c3.png) 46 | ![image](../assets/bc0ff61b6d6e8727952f2b8889ce971.png) 47 | 48 | 49 | 复制登录命令 50 | 51 | 然后回到 VSCode 点击左侧的远程按钮,点击 SSH 的 + 号,在弹出的窗口中输入开发机的登录命令。 52 | 53 | 
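复制到的登录命令大致形如下面这样(主机名与端口号仅为示意,请以你自己开发机控制台显示的登录命令为准,它与后文端口映射命令中的地址和端口是同一套信息):

```bash
# 示意:autodl 开发机的 SSH 登录命令,粘贴到弹出的窗口中回车即可
ssh root@connect.bjc1.seetacloud.com -p 42408
```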
![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/430f87b6-f66d-4048-b320-3c218d0848f0) 54 | 55 | 将 ssh 登录信息写入配置文件,我们刷新 ssh 列表就可以看到我们刚刚配置的 ssh 连接了。 56 | 57 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/5cf737da-6f3f-4c37-bfdc-ae28555772f9) 58 | 59 | 点击连接,输入密码,就成功连接到开发机了。 60 | 61 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/106b906f-9517-4325-887b-dd773d9964b6) 62 | 63 | ### 三、配置 VSCode 端口映射 64 | 65 | 下面会介绍两种端口映射的方法: 66 | 67 | #### 方法一: 68 | 我们可以使用 `Ctrl + Shift + ~` 快捷键打开 VSCode 终端,然后点击右边的 Ports 界面,接着点击 Foward a Port 按钮。 69 | 70 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/9ab25bae-273a-4d42-859f-77ad83d7ca24) 71 | 72 | 比如我们的端口为 6006 在这里我们就可以这样设置。 73 | 74 | ![image](https://github.com/acwwt/Llama3-Tutorial/assets/110531742/b465120a-ac35-42fb-8bc6-2c1777072d9c) 75 | 76 | 其中第一个 port 是映射在本机的端口,后面的Fowarded Address 是开发机的IP地址和端口。也就是将开发机的 6006 端口映射在了本机的 6006 这个端口,当然本机的端口是可以更改的。 77 | 78 | 但我们运行 streamlit 或者是 gradio 应用的时候,VSCode 会自动的帮我们进行端口映射,并不需要我们手动操作,所以我们介绍另一种端口映射方法,也是最常用的。 79 | 80 | #### 方法二: 81 | 82 | 进行端口映射 83 | 84 | 我们来到开发机控制台,点击自定义服务,复制下面的命令: 85 | 86 | ![image](../assets/f26c56bb42d4390877384f5e81c8109.png) 87 | 88 | ```bash 89 | ssh -CNg -L 6006:127.0.0.1:6006 root@connect.bjc1.seetacloud.com -p 42408 90 | ``` 91 | > 其中 42408 是你当前运行的开发机的端口,6006 是我们要进行映射端口,这个端口要根据运行的应用程序的端口来自行修改。 92 | 93 | 将复制的命令粘贴到本机的 vscode terminal 中,输入yes,输入密码回车 (可能会卡住,正常现象) 94 | 95 | ![image](../assets/6faf70b7cb961d031078bd4f340cda6.png) 96 | 97 | 然后进入 http://localhost:6006 即可访问,当然你没服务的时候是没反应的。 98 | -------------------------------------------------------------------------------- /docs_autodl/hello_world.md: -------------------------------------------------------------------------------- 1 | # 实践教程(autodl 版) 2 | 3 | ## 环境配置 4 | 5 | (如果使用datawhale镜像则无需配置如下内容) 6 | ```shell 7 | conda create -n llama3 python=3.10 8 | conda activate llama3 9 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 10 | ``` 11 | 12 | ## 下载模型 13 | 14 | 15 | ```shell 16 | cd ~/autodl-tmp/ 17 | # 新建一个down.py文件 18 | # 写入 19 | 20 | import torchfrom modelscope import snapshot_download, AutoModel, utoTokenizer import os 21 | # 这里我用了llama3中文社区的微调模型,如果需要别的以相同方法到modelscope下载模型 22 | model_dir = snapshot_download('baicai003/Llama3-Chinese_v2',cache_dir='/root/autodl-tmp', revision='master') 23 | 24 | 25 | #然后在~/autodl-tmp/ 下执行 26 | python down.py 27 | ``` 28 | 29 | ## Web Demo 部署 30 | 31 | ```shell 32 | cd ~/autodl-tmp/ 33 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 34 | ``` 35 | 36 | 安装 XTuner 时会自动安装其他依赖 37 | ```shell 38 | cd ~/autodl-tmp/ 39 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 40 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 41 | cd XTuner 42 | pip install -e . 
43 | ``` 44 | 45 | 运行 web_demo.py 46 | (无卡模式的宝子们,现在请关机并切换为有卡模式再开机) 47 | ```shell 48 | streamlit run ~/autodl-tmp/Llama3-Tutorial/tools/internstudio_web_demo.py \ 49 | ~/autodl-tmp/baicai003/Llama3-Chinese_v2 --server.port 6006 --server.address 0.0.0.0 50 | ``` 51 | 52 | ![image](../assets/c6636b3b34fc6341cec39baf6a2c6c3.png) 53 | -------------------------------------------------------------------------------- /docs_autodl/llava.md: -------------------------------------------------------------------------------- 1 | # XTuner 微调 Llama3 图片理解多模态 2 | 3 | 随着 XTuner 团队放出了基于 Llama3-8B 的 LLaVA 模型,我们也是第一时间与 XTuner 团队取得了联系,并获得了他们已经预训练好的 Image Projector。接下来,我们将带大家基于 Llama3-8B-Instruct 和 XTuner 团队预训练好的 Image Projector 微调自己的多模态图文理解模型 LLaVA。 4 | 5 | ## 环境、模型、数据准备 6 | 7 | ### 配置环境 8 | 9 | 注意:本次课程需要的算力和存储较大,建议使用 L20 GPU,并扩容 50G 存储 10 | 11 | 我们先来配置相关环境。使用如下指令便可以安装好一个 python=3.10 pytorch=2.1.2+cu121 的基础环境了。 12 | (如果使用datawhale镜像则无需配置如下内容) 13 | ```bash 14 | conda create -n llama3 python=3.10 15 | conda activate llama3 16 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 17 | ``` 18 | 接下来我们安装 XTuner。 19 | 20 | ```bash 21 | cd ~/autodl-tmp/ 22 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 23 | git clone -b v0.1.18 https://github.com/InternLM/XTuner 24 | cd XTuner 25 | pip install -e .[all] 26 | ``` 27 | 28 | 最后我们 clone 本教程仓库(如果上一步已经克隆过,可跳过这一步)。 29 | 30 | ```bash 31 | cd ~/autodl-tmp/ 32 | git clone https://github.com/SmartFlowAI/Llama3-Tutorial 33 | ``` 34 | 35 | ### 模型准备 36 | 37 | #### 准备 Llama3 权重、Image Projector 权重 38 | 39 | 在微调开始前,我们首先来准备 Llama3-8B-Instruct 模型权重及 clip-vit-large-patch14-336 权重(Visual Encoder 权重)。 40 | 41 | ```shell 42 | cd ~/autodl-tmp/ 43 | # 新建一个 down.py 文件,写入以下内容 44 | 45 | import torch 46 | from modelscope import snapshot_download 47 | # 这里我用了llama3中文社区的微调模型,如果需要别的以相同方法到modelscope下载模型 48 | model_dir = snapshot_download('baicai003/Llama3-Chinese_v2', cache_dir='/root/autodl-tmp', revision='master') 49 | model_dir = snapshot_download('AI-ModelScope/clip-vit-large-patch14-336', cache_dir='/root/autodl-tmp', revision='master') 50 | 51 | # 然后在 ~/autodl-tmp/ 下执行 52 | python down.py 53 | ``` 54 | 55 | 有一个预训练模型 iter_2181_new.pth 需要大家从互联网下载: 56 | ```shell 57 | cd ~/autodl-tmp/ 58 | git clone https://gitee.com/bardwang/llama3-tutorial-iter_2181_new.git 59 | mv ~/autodl-tmp/llama3-tutorial-iter_2181_new/iter_2181_new.pth ~/autodl-tmp/ 60 | ``` 61 | 62 | ### 数据准备 63 | 64 | 我们按照 https://github.com/InternLM/Tutorial/blob/camp2/xtuner/llava/xtuner_llava.md 中的教程来准备微调数据。为了让大家可以快速上手,我们选择了使用过拟合的方式快速实现。 65 | 66 | 67 | 可以执行以下代码: 68 | 69 | ```bash 70 | cd ~/autodl-tmp/ 71 | git clone https://github.com/InternLM/tutorial -b camp2 72 | python ~/autodl-tmp/tutorial/xtuner/llava/llava_data/repeat.py \ 73 | -i ~/autodl-tmp/tutorial/xtuner/llava/llava_data/unique_data.json \ 74 | -o ~/autodl-tmp/tutorial/xtuner/llava/llava_data/repeated_data.json \ 75 | -n 200 76 | ``` 77 | 78 | ## 微调过程 79 | 80 | ### 训练启动 81 | 82 | 83 | 请大家按照下图修改 ~/autodl-tmp/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py 文件 84 | ![image](../assets/80b73c2789fe16bf5a5d6291a152127.png) 85 | 86 | 我们使用如下指令以启动训练: 87 | 88 | ```bash 89 | xtuner train ~/autodl-tmp/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py --work-dir ~/autodl-tmp/llama3_llava_pth --deepspeed deepspeed_zero2 90 | ``` 91 | 92 | 训练过程所需显存约为 44447 MiB,在单卡 L20(48G) 上训练所需时间为 36 分钟。 93 | 94 | 在训练好之后,我们将原始 image projector 
和 我们微调得到的 image projector 都转换为 HuggingFace 格式,为了下面的效果体验做准备。 95 | 96 | ```bash 97 | # 因为这段数据没有emm 大家直接体验微调的效果~ 98 | # xtuner convert pth_to_hf ~/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py \ 99 | # ~/model/llama3-llava-iter_2181.pth \ 100 | # ~/llama3_llava_pth/pretrain_iter_2181_hf 101 | 102 | xtuner convert pth_to_hf ~/Llama3-Tutorial/configs/llama3-llava/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_lora_e1_finetune.py \ 103 | ~/llama3_llava_pth/iter_1200.pth \ 104 | ~/llama3_llava_pth/iter_1200_hf 105 | ``` 106 | 107 | ### 效果体验 108 | 109 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/551bfebf-399c-4aec-985b-affa94a5963b) 110 | 111 | 在转换完成后,我们就可以在命令行简单体验一下微调后模型的效果了。 112 | 113 | > 问题1:Describe this image. 114 | > 问题2:What is the equipment in the image? 115 | 116 | #### Pretrain 模型 117 | 118 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/75657629/0ddd6ed1-97d2-46e6-b580-5d6425a15604) 119 | 120 | 此时可以看到,Pretrain 模型只会为图片打标签,并不能回答问题。 121 | 122 | #### Finetune 后 模型 123 | 124 | ```bash 125 | export MKL_SERVICE_FORCE_INTEL=1 126 | xtuner chat /root/autodl-tmp/baicai003/llama-3-8b-Instruct-chinese_v2 \ 127 | --visual-encoder /root/autodl-tmp/AI-ModelScope/clip-vit-large-patch14-336 \ 128 | --llava /root/autodl-tmp/llama3_llava_pth/iter_1200_hf \ 129 | --prompt-template llama3_chat \ 130 | --image /root/autodl-tmp/tutorial/xtuner/llava/llava_data/test_img/oph.jpg 131 | ``` 132 | 133 | ![image](../assets/5a7795e27634a0136de66eff06e9846.png) 134 | 135 | 经过 Finetune 后,我们可以发现,模型已经可以根据图片回答我们的问题了。 136 | -------------------------------------------------------------------------------- /docs_autodl/lmdeploy.md: -------------------------------------------------------------------------------- 1 | # LMDeploy 高效部署 Llama3 实践 2 | 3 | Llama 3 近期重磅发布,发布了 8B 和 70B 参数量的模型,lmdeploy团队对 Llama 3 部署进行了光速支持!!! 4 | 5 | 书生·浦语和机智流社区同学光速投稿了 LMDeploy 高效量化部署 Llama 3,欢迎 Star。 6 | 7 | https://github.com/internLM/LMDeploy 8 | 9 | https://github.com/SmartFlowAI/Llama3-Tutorial/ 10 | 11 | ## 本文将分为以下几个部分来介绍,如何使用LMDeploy来部署Llama3 12 | - 环境,模型准备 13 | - LMDeploy chat 14 | - LMDeploy模型量化(lite) 15 | - LMDeploy服务(serve) 16 | 17 | ## 1. 环境,模型准备 18 | ### 1.1 环境配置 19 | 20 | ```shell 21 | (如果使用datawahle镜像则无需配置如下内容) 22 | # 初始化环境 23 | conda create -n lmdeploy python=3.10 24 | conda activate lmdeploy 25 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia 26 | ``` 27 | 28 | 安装lmdeploy最新版。 29 | ```shell 30 | pip install -U lmdeploy[all] 31 | ``` 32 | 33 | ### 1.2 Llama3 的下载 34 | 35 | ```shell 36 | cd ~/autodl-tmp/ 37 | # 新建一个down.py文件 38 | # 写入 39 | 40 | import torchfrom modelscope import snapshot_download, AutoModel, utoTokenizer import os 41 | # 这里我用了llama3中文社区的微调的风格化表情模型,如果需要别的以相同方法到modelscope下载模型 42 | model_dir = snapshot_download('baicai003/Llama3-Chinese-instruct-DPO-beta0.5',cache_dir='/root/autodl-tmp', revision='master') 43 | 44 | 45 | #然后在~/autodl-tmp/ 下执行 46 | python down.py 47 | ``` 48 | 49 | ## 2. LMDeploy Chat CLI 工具 50 | 51 | 直接在终端运行 52 | ```shell 53 | lmdeploy chat /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 54 | ``` 55 | 56 | 运行结果是: 57 | ![1713941173524](../assets/a261ba792e071c91cfd815e0360851a.png) 58 | 59 | 60 | ## 3. 
LMDeploy模型量化(lite) 61 | 本部分内容主要介绍如何对模型进行量化。主要包括 KV8量化和W4A16量化。 62 | 63 | ### 3.1 设置最大KV Cache缓存大小 64 | 模型在运行时,占用的显存可大致分为三部分:模型参数本身占用的显存、KV Cache占用的显存,以及中间运算结果占用的显存。LMDeploy的KV Cache管理器可以通过设置--cache-max-entry-count参数,控制KV缓存占用剩余显存的最大比例。默认的比例为0.8。 65 | 66 | 下面通过几个例子,来看一下调整`--cache-max-entry-count`参数的效果。首先保持不加该参数(默认0.8),运行 Llama3-8b 模型。 67 | 68 | ```shell 69 | lmdeploy chat /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 70 | ``` 71 | 72 | 新建一个终端运行 73 | 74 | ```shell 75 | # 如果你是InternStudio 就使用 76 | # studio-smi 77 | nvidia-smi 78 | ``` 79 | 80 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/239b926c-335b-4390-8ce2-1ef23db47773) 81 | 82 | 此时模型的占用为33236M。下面,改变`--cache-max-entry-count`参数,设为0.5。 83 | 84 | ```shell 85 | lmdeploy chat /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 --cache-max-entry-count 0.5 86 | ``` 87 | 88 | 新建一个终端运行 89 | ```shell 90 | # 如果你是InternStudio 就使用 91 | # studio-smi 92 | nvidia-smi 93 | ``` 94 | 95 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/f992dedd-f14f-4437-b824-41e9f9e1e3da) 96 | 97 | 看到显存占用明显降低,变为26708M。 98 | 99 | 面来一波“极限”,把`--cache-max-entry-count`参数设置为0.01,约等于禁止KV Cache占用显存。 100 | 101 | ```shell 102 | lmdeploy chat /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 --cache-max-entry-count 0.01 103 | ``` 104 | 105 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/9237bf1d-f8a9-4dc5-b325-504fa5df3ad6) 106 | 107 | 然后与模型对话,可以看到,此时显存占用仅为16176M,代价是会降低模型推理速度。 108 | 109 | ### 3.2 使用W4A16量化 110 | 111 | 仅需执行一条命令,就可以完成模型量化工作。 112 | ```shell 113 | lmdeploy lite auto_awq \ 114 | /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 \ 115 | --calib-dataset 'ptb' \ 116 | --calib-samples 128 \ 117 | --calib-seqlen 1024 \ 118 | --w-bits 4 \ 119 | --w-group-size 128 \ 120 | --work-dir /root/autodl-tmp/model/Meta-Llama-3-8B-Instruct_4bit 121 | ``` 122 | 运行时间较长,请耐心等待。量化工作结束后,新的HF模型被保存到`Meta-Llama-3-8B-Instruct_4bit`目录。下面使用Chat功能运行W4A16量化后的模型。 123 | 124 | ```shell 125 | lmdeploy chat /root/autodl-tmp/model/Meta-Llama-3-8B-Instruct_4bit --model-format awq 126 | ``` 127 | 128 | 为了更加明显体会到W4A16的作用,我们将KV Cache比例再次调为0.01,查看显存占用情况。 129 | 130 | ```shell 131 | lmdeploy chat /root/autodl-tmp/model/Meta-Llama-3-8B-Instruct_4bit --model-format awq --cache-max-entry-count 0.01 132 | ``` 133 | 134 | 可以看到,显存占用变为6738MB,明显降低。 135 | 136 | ![1714114852593](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/57640594/e0fe3fc4-8f8b-4dad-a36d-0055d07bb459) 137 | 138 | 139 | ## 3.3 在线量化 KV 140 | 141 | 自 v0.4.0 起,LMDeploy KV 量化方式有原来的离线改为在线。并且,支持两种数值精度 int4、int8。量化方式为 per-head per-token 的非对称量化。它具备以下优势: 142 | 1. 量化不需要校准数据集 143 | 2. kv int8 量化精度几乎无损,kv int4 量化精度在可接受范围之内 144 | 3. 推理高效,在 llama2-7b 上加入 int8/int4 kv 量化,RPS 相较于 fp16 分别提升近 30% 和 40% 145 | 4. 支持 volta 架构(sm70)及以上的所有显卡型号:V100、20系列、T4、30系列、40系列、A10、A100 等等 146 | 通过 LMDeploy 应用 kv 量化非常简单,只需要设定 quant_policy 参数。LMDeploy 规定 qant_policy=4表示 kv int4 量化,quant_policy=8 表示 kv int8 量化。 147 | 148 | ## 4. 
LMDeploy服务(serve) 149 | 在前面的章节,我们都是在本地直接推理大模型,这种方式成为本地部署。在生产环境下,我们有时会将大模型封装为 API 接口服务,供客户端访问。 150 | 151 | ### 4.1 启动API服务器 152 | 通过以下命令启动API服务器,推理Meta-Llama-3-8B-Instruct模型: 153 | 154 | ```shell 155 | lmdeploy serve api_server \ 156 | /root/autodl-tmp/baicai003/Llama3-Chinese-instruct-DPO-beta0.5 \ 157 | --model-format hf \ 158 | --quant-policy 0 \ 159 | --server-name 0.0.0.0 \ 160 | --server-port 23333 \ 161 | --tp 1 162 | ``` 163 | 164 | 其中,model-format、quant-policy这些参数是与第三章中量化推理模型一致的;server-name和server-port表示API服务器的服务IP与服务端口;tp参数表示并行数量(GPU数量)。 165 | 通过运行以上指令,我们成功启动了API服务器。 166 | 167 | 168 | ![image](https://github.com/Shengshenlan/Llama3-XTuner-CN/assets/57640594/d4ef3e71-942b-4aa7-af7e-f1a47cc6ec37) 169 | 170 | 171 | 可以配置为6006端口 然后打开浏览器,访问http://127.0.0.1:6006。但是等会记得改回来~ 172 | 173 | ### 4.2 命令行客户端连接API服务器 174 | 在“4.1”中,我们在终端里新开了一个API服务器。 175 | 本节中,我们要新建一个命令行客户端去连接API服务器。首先通过VS Code新建一个终端: 176 | 177 | 运行命令行客户端: 178 | ```shell 179 | lmdeploy serve api_client http://localhost:23333 180 | ``` 181 | 运行后,可以通过命令行窗口直接与模型对话 182 | 183 | ### 4.3 网页客户端连接API服务器 184 | 关闭刚刚的VSCode终端,但服务器端的终端不要关闭。 185 | 运行之前确保自己的gradio版本低于4.0.0。 186 | ```shell 187 | pip install gradio==3.50.2 188 | ``` 189 | 190 | 使用Gradio作为前端,启动网页客户端。 191 | ```shell 192 | lmdeploy serve gradio http://localhost:23333 \ 193 | --server-name 0.0.0.0 \ 194 | --server-port 6006 195 | ``` 196 | 打开浏览器,访问地址http://127.0.0.1:6006 197 | 然后就可以与模型进行对话了! 198 | 199 | ![Alt text](../assets/b9228fce991201c68efcc884b824c3e.png) 200 | 201 | # 拓展部分: 202 | 203 | ## 5. 推理速度 204 | 205 | 使用 LMDeploy 在 A100(80G)推理 Llama3,每秒请求处理数(RPS)高达 25,是 vLLM 推理效率的 1.8+ 倍。 206 | 207 | 208 | - 克隆仓库 209 | 210 | ```shell 211 | cd ~ 212 | git clone https://github.com/InternLM/lmdeploy.git 213 | ``` 214 | 215 | - 下载测试数据 216 | 217 | ```shell 218 | cd /root/lmdeploy 219 | wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 220 | ``` 221 | 222 | - 执行 benchmark 命令(如果你的显存较小,可以调低`--cache-max-entry-count`) 223 | ```shell 224 | python benchmark/profile_throughput.py \ 225 | ShareGPT_V3_unfiltered_cleaned_split.json \ 226 | /root/model/Meta-Llama-3-8B-Instruct \ 227 | --cache-max-entry-count 0.8 \ 228 | --concurrency 256 \ 229 | --model-format hf \ 230 | --quant-policy 0 \ 231 | --num-prompts 10000 232 | ``` 233 | 结果是: 234 | ```shell 235 | concurrency: 256 236 | elapsed_time: 399.739s 237 | 238 | first token latency(s)(min, max, ave): 0.068, 4.066, 0.285 239 | per-token latency(s) percentile(50, 75, 95, 99): [0, 0.094, 0.169, 0.227] 240 | 241 | number of prompt tokens: 2238364 242 | number of completion tokens: 2005448 243 | token throughput (completion token): 5016.892 token/s 244 | token throughput (prompt + completion token): 10616.453 token/s 245 | RPS (request per second): 25.016 req/s 246 | RPM (request per minute): 1500.979 req/min 247 | ``` 248 | 249 | ## 6. 
使用LMDeploy运行视觉多模态大模型Llava-Llama-3 250 | ### 6.1 安装依赖 251 | ```shell 252 | pip install git+https://github.com/haotian-liu/LLaVA.git 253 | ``` 254 | ### 6.2 运行模型 255 | 运行`touch /root/pipeline_llava.py` 新建一个文件夹,复制下列代码进去 256 | ```shell 257 | from lmdeploy import pipeline, ChatTemplateConfig 258 | from lmdeploy.vl import load_image 259 | pipe = pipeline('xtuner/llava-llama-3-8b-v1_1-hf', 260 | chat_template_config=ChatTemplateConfig(model_name='llama3')) 261 | 262 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') 263 | response = pipe(('describe this image', image)) 264 | print(response.text) 265 | ``` 266 | 运行结果为: 267 | ![image](https://github.com/SmartFlowAI/Llama3-Tutorial/assets/57640594/a96148e8-9c3b-44b0-a67c-268e37d7245d) 268 | 269 | 你可以在 https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-hf 找到更多细节 270 | 271 | 教程如有误请访问获取最新内容: 272 | https://github.com/SmartFlowAI/Llama3-Tutorial/ 273 | 274 | LMDeploy仓库:https://github.com/InternLM/lmdeploy.git 275 | -------------------------------------------------------------------------------- /tools/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartFlowAI/Llama3-Tutorial/85c97a516f87a168bb6337b4e78790b6945ee094/tools/.gitkeep -------------------------------------------------------------------------------- /tools/agent_cli_demo.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from lagent.actions import ActionExecutor, ArxivSearch, FinishAction 4 | from lagent.agents.react import CALL_PROTOCOL_EN, FORCE_STOP_PROMPT_EN, ReAct, ReActProtocol 5 | from lagent.llms import LMDeployClient 6 | from lagent.schema import AgentStatusCode 7 | 8 | 9 | LLAMA3_META = [ 10 | dict( 11 | role='system', 12 | begin='<|start_header_id|>system<|end_header_id|>\n\n', 13 | end='<|eot_id|>'), 14 | dict( 15 | role='user', 16 | begin='<|start_header_id|>user<|end_header_id|>\n\n', 17 | end='<|eot_id|>'), 18 | dict( 19 | role='assistant', 20 | begin='<|start_header_id|>assistant<|end_header_id|>\n\n', 21 | end='<|eot_id|>'), 22 | ] 23 | 24 | 25 | def parse_args(): 26 | parser = ArgumentParser(description='chatbot') 27 | parser.add_argument( 28 | '--url', 29 | type=str, 30 | default='http://127.0.0.1:23333', 31 | help='The url of LMDeploy server') 32 | parser.add_argument( 33 | '--model-name', 34 | type=str, 35 | default='llama3', 36 | help='The model name') 37 | args = parser.parse_args() 38 | return args 39 | 40 | 41 | def main(): 42 | # 初始化部分 43 | args = parse_args() 44 | actions = [ 45 | ArxivSearch(), 46 | # 必须要有 FinishAction 以保证输出 47 | FinishAction(), 48 | ] 49 | model = LMDeployClient( 50 | model_name=args.model_name, 51 | url=args.url, 52 | meta_template=LLAMA3_META, 53 | max_new_tokens=1024, 54 | top_p=0.8, 55 | top_k=100, 56 | temperature=0, 57 | repetition_penalty=1.0, 58 | stop_words=['<|eot_id|>']) 59 | agent = ReAct( 60 | llm=model, 61 | action_executor=ActionExecutor(actions=actions), 62 | protocol=ReActProtocol(call_protocol=CALL_PROTOCOL_EN, 63 | force_stop=FORCE_STOP_PROMPT_EN), 64 | max_turn=7) 65 | # 准备对话部分 66 | history = [] 67 | 68 | def input_prompt(): 69 | print('\ndouble enter to end input >>> ', end='', flush=True) 70 | sentinel = '' # ends when this string is seen 71 | return '\n'.join(iter(input, sentinel)) 72 | 73 | while True: 74 | try: 75 | prompt = input_prompt() 76 | except UnicodeDecodeError: 77 | print('UnicodeDecodeError') 78 | 
continue 79 | if prompt == 'exit': 80 | exit(0) 81 | if prompt == 'clear': 82 | history.clear() 83 | continue 84 | history.append(dict(role='user', content=prompt)) 85 | print('\nLLAMA3:', end='') 86 | # 拿到输出 87 | agent_return = agent.chat(history) 88 | if agent_return.state == AgentStatusCode.END: 89 | for action in agent_return.actions: 90 | if (action) and action.type != 'FinishAction': 91 | print(action) 92 | print(agent_return.response) 93 | history.extend(agent_return.inner_steps) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /tools/agent_web_demo.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import hashlib 3 | import json 4 | import os 5 | import sys 6 | 7 | import streamlit as st 8 | 9 | from lagent.actions import ActionExecutor, ArxivSearch, FinishAction 10 | from lagent.agents.react import CALL_PROTOCOL_EN, FORCE_STOP_PROMPT_EN, ReAct, ReActProtocol 11 | from lagent.llms import HFTransformerCasualLM 12 | from lagent.schema import AgentStatusCode 13 | 14 | 15 | LLAMA3_META = [ 16 | dict( 17 | role='system', 18 | begin='<|start_header_id|>system<|end_header_id|>\n\n', 19 | end='<|eot_id|>'), 20 | dict( 21 | role='user', 22 | begin='<|start_header_id|>user<|end_header_id|>\n\n', 23 | end='<|eot_id|>'), 24 | dict( 25 | role='assistant', 26 | begin='<|start_header_id|>assistant<|end_header_id|>\n\n', 27 | end='<|eot_id|>'), 28 | ] 29 | 30 | 31 | class SessionState: 32 | 33 | def init_state(self): 34 | """Initialize session state variables.""" 35 | st.session_state['assistant'] = [] 36 | st.session_state['user'] = [] 37 | 38 | action_list = [ 39 | ArxivSearch(), 40 | ] 41 | st.session_state['plugin_map'] = { 42 | action.name: action 43 | for action in action_list 44 | } 45 | st.session_state['model_map'] = {} 46 | st.session_state['model_selected'] = None 47 | st.session_state['plugin_actions'] = set() 48 | st.session_state['history'] = [] 49 | 50 | def clear_state(self): 51 | """Clear the existing session state.""" 52 | st.session_state['assistant'] = [] 53 | st.session_state['user'] = [] 54 | st.session_state['model_selected'] = None 55 | st.session_state['file'] = set() 56 | if 'chatbot' in st.session_state: 57 | st.session_state['chatbot']._session_history = [] 58 | 59 | 60 | class StreamlitUI: 61 | 62 | def __init__(self, session_state: SessionState, model_path: str): 63 | self.init_streamlit() 64 | self.session_state = session_state 65 | self.model_path = model_path 66 | 67 | def init_streamlit(self): 68 | """Initialize Streamlit's UI settings.""" 69 | st.set_page_config( 70 | layout='wide', 71 | page_title='lagent-web', 72 | page_icon='./docs/imgs/lagent_icon.png') 73 | st.header(':robot_face: :blue[Lagent] Web Demo ', divider='rainbow') 74 | st.sidebar.title('模型控制') 75 | st.session_state['file'] = set() 76 | st.session_state['model_path'] = None 77 | 78 | def setup_sidebar(self): 79 | """Setup the sidebar with the available models.""" 80 | model_name = st.sidebar.text_input('模型名称:', value='LLaMA-3-8B-Instruct') 81 | call_protocol = st.sidebar.text_area('调用协议提示:', value=CALL_PROTOCOL_EN) 82 | force_stop = st.sidebar.text_area( 83 | '强制停止提示:', value=FORCE_STOP_PROMPT_EN) 84 | model_path = st.sidebar.text_input( 85 | '模型路径:', value=self.model_path) 86 | if model_name != st.session_state['model_selected'] or st.session_state[ 87 | 'model_path'] != model_path: 88 | st.session_state['model_path'] = model_path 89 | model = 
self.init_model(model_name, model_path) 90 | self.session_state.clear_state() 91 | st.session_state['model_selected'] = model_name 92 | if 'chatbot' in st.session_state: 93 | del st.session_state['chatbot'] 94 | else: 95 | model = st.session_state['model_map'][model_name] 96 | 97 | plugin_name = st.sidebar.multiselect( 98 | '插件选择', 99 | options=list(st.session_state['plugin_map'].keys()), 100 | default=[], 101 | ) 102 | plugin_action = [ 103 | st.session_state['plugin_map'][name] for name in plugin_name 104 | ] 105 | # 保证一定有 FinishAction 以输出 106 | plugin_action.append(FinishAction()) 107 | 108 | if 'chatbot' in st.session_state: 109 | if len(plugin_action) > 0: 110 | st.session_state['chatbot']._action_executor = ActionExecutor( 111 | actions=plugin_action) 112 | else: 113 | st.session_state['chatbot']._action_executor = None 114 | st.session_state['chatbot']._protocol.call_protocol = call_protocol 115 | st.session_state['chatbot']._protocol.force_stop = force_stop 116 | if st.sidebar.button('清空对话', key='clear'): 117 | self.session_state.clear_state() 118 | uploaded_file = st.sidebar.file_uploader('上传文件') 119 | 120 | return model_name, model, plugin_action, uploaded_file, model_path 121 | 122 | def init_model(self, model_name, path): 123 | """Initialize the model based on the input model name.""" 124 | st.session_state['model_map'][model_name] = HFTransformerCasualLM( 125 | path=path, 126 | meta_template=LLAMA3_META, 127 | max_new_tokens=1024, 128 | top_p=0.8, 129 | top_k=None, 130 | temperature=0.1, 131 | repetition_penalty=1.0, 132 | stop_words=['<|eot_id|>']) 133 | return st.session_state['model_map'][model_name] 134 | 135 | def initialize_chatbot(self, model, plugin_action): 136 | """Initialize the chatbot with the given model and plugin actions.""" 137 | return ReAct( 138 | llm=model, 139 | action_executor=None, 140 | protocol=ReActProtocol(), 141 | max_turn=7) 142 | 143 | def render_user(self, prompt: str): 144 | with st.chat_message('user'): 145 | st.markdown(prompt) 146 | 147 | def render_assistant(self, agent_return): 148 | with st.chat_message('assistant'): 149 | for action in agent_return.actions: 150 | if (action) and (action.type != 'FinishAction'): 151 | self.render_action(action) 152 | st.markdown(agent_return.response) 153 | 154 | def render_plugin_args(self, action): 155 | action_name = action.type 156 | args = action.args 157 | import json 158 | parameter_dict = dict(name=action_name, parameters=args) 159 | parameter_str = '```json\n' + json.dumps( 160 | parameter_dict, indent=4, ensure_ascii=False) + '\n```' 161 | st.markdown(parameter_str) 162 | 163 | def render_interpreter_args(self, action): 164 | st.info(action.type) 165 | st.markdown(action.args['text']) 166 | 167 | def render_action(self, action): 168 | st.markdown(action.thought) 169 | if action.type == 'IPythonInterpreter': 170 | self.render_interpreter_args(action) 171 | elif action.type == 'FinishAction': 172 | pass 173 | else: 174 | self.render_plugin_args(action) 175 | self.render_action_results(action) 176 | 177 | def render_action_results(self, action): 178 | """Render the results of action, including text, images, videos, and 179 | audios.""" 180 | if (isinstance(action.result, dict)): 181 | if 'text' in action.result: 182 | st.markdown('```\n' + action.result['text'] + '\n```') 183 | if 'image' in action.result: 184 | # image_path = action.result['image'] 185 | for image_path in action.result['image']: 186 | image_data = open(image_path, 'rb').read() 187 | st.image(image_data, caption='Generated Image') 
188 | if 'video' in action.result: 189 | video_data = action.result['video'] 190 | video_data = open(video_data, 'rb').read() 191 | st.video(video_data) 192 | if 'audio' in action.result: 193 | audio_data = action.result['audio'] 194 | audio_data = open(audio_data, 'rb').read() 195 | st.audio(audio_data) 196 | elif isinstance(action.result, list): 197 | for item in action.result: 198 | if item['type'] == 'text': 199 | st.markdown('```\n' + item['content'] + '\n```') 200 | elif item['type'] == 'image': 201 | image_data = open(item['content'], 'rb').read() 202 | st.image(image_data, caption='Generated Image') 203 | elif item['type'] == 'video': 204 | video_data = open(item['content'], 'rb').read() 205 | st.video(video_data) 206 | elif item['type'] == 'audio': 207 | audio_data = open(item['content'], 'rb').read() 208 | st.audio(audio_data) 209 | if action.errmsg: 210 | st.error(action.errmsg) 211 | 212 | 213 | def main(model_path): 214 | # logger = get_logger(__name__) 215 | # Initialize Streamlit UI and setup sidebar 216 | if 'ui' not in st.session_state: 217 | session_state = SessionState() 218 | session_state.init_state() 219 | st.session_state['ui'] = StreamlitUI(session_state, model_path) 220 | 221 | else: 222 | st.set_page_config( 223 | layout='wide', 224 | page_title='lagent-web', 225 | page_icon='./docs/imgs/lagent_icon.png') 226 | st.header(':robot_face: :blue[Lagent] Web Demo ', divider='rainbow') 227 | _, model, plugin_action, uploaded_file, _ = st.session_state[ 228 | 'ui'].setup_sidebar() 229 | 230 | # Initialize chatbot if it is not already initialized 231 | # or if the model has changed 232 | if 'chatbot' not in st.session_state or model != st.session_state[ 233 | 'chatbot']._llm: 234 | st.session_state['chatbot'] = st.session_state[ 235 | 'ui'].initialize_chatbot(model, plugin_action) 236 | st.session_state['session_history'] = [] 237 | 238 | for prompt, agent_return in zip(st.session_state['user'], 239 | st.session_state['assistant']): 240 | st.session_state['ui'].render_user(prompt) 241 | st.session_state['ui'].render_assistant(agent_return) 242 | 243 | if user_input := st.chat_input(''): 244 | with st.container(): 245 | st.session_state['ui'].render_user(user_input) 246 | st.session_state['user'].append(user_input) 247 | # Add file uploader to sidebar 248 | if (uploaded_file 249 | and uploaded_file.name not in st.session_state['file']): 250 | 251 | st.session_state['file'].add(uploaded_file.name) 252 | file_bytes = uploaded_file.read() 253 | file_type = uploaded_file.type 254 | if 'image' in file_type: 255 | st.image(file_bytes, caption='Uploaded Image') 256 | elif 'video' in file_type: 257 | st.video(file_bytes, caption='Uploaded Video') 258 | elif 'audio' in file_type: 259 | st.audio(file_bytes, caption='Uploaded Audio') 260 | # Save the file to a temporary location and get the path 261 | 262 | postfix = uploaded_file.name.split('.')[-1] 263 | # prefix = str(uuid.uuid4()) 264 | prefix = hashlib.md5(file_bytes).hexdigest() 265 | filename = f'{prefix}.{postfix}' 266 | file_path = os.path.join(root_dir, filename) 267 | with open(file_path, 'wb') as tmpfile: 268 | tmpfile.write(file_bytes) 269 | file_size = os.stat(file_path).st_size / 1024 / 1024 270 | file_size = f'{round(file_size, 2)} MB' 271 | # st.write(f'File saved at: {file_path}') 272 | user_input = [ 273 | dict(role='user', content=user_input), 274 | dict( 275 | role='user', 276 | content=json.dumps(dict(path=file_path, size=file_size)), 277 | name='file') 278 | ] 279 | if isinstance(user_input, str): 280 | 
user_input = [dict(role='user', content=user_input)] 281 | st.session_state['last_status'] = AgentStatusCode.SESSION_READY 282 | agent_return = st.session_state['chatbot'].chat( 283 | st.session_state['session_history'] + user_input) 284 | if agent_return.state == AgentStatusCode.END: 285 | st.session_state['ui'].render_assistant(agent_return) 286 | st.session_state['session_history'] += ( 287 | user_input + agent_return.inner_steps) 288 | st.session_state['assistant'].append(copy.deepcopy(agent_return)) 289 | st.session_state['last_status'] = agent_return.state 290 | 291 | 292 | if __name__ == '__main__': 293 | model_path = sys.argv[1] 294 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 295 | root_dir = os.path.join(root_dir, 'tmp_dir') 296 | os.makedirs(root_dir, exist_ok=True) 297 | main(model_path) 298 | -------------------------------------------------------------------------------- /tools/convert_agentflan.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from datasets import Dataset 5 | 6 | 7 | file_path = sys.argv[1] # /xxx/internlm/Agent-Flan/data 8 | if file_path.endswith('/'): 9 | file_path = file_path[:-1] 10 | 11 | ds = [] 12 | for file in os.listdir(file_path): 13 | if not file.endswith('.jsonl'): 14 | continue 15 | with open(os.path.join(file_path, file)) as f: 16 | dataset = f.readlines() 17 | for item in dataset: 18 | conv = json.loads(item) 19 | conv['messages'] = conv.pop('conversation') 20 | if 'id' in conv: 21 | conv.pop('id') 22 | ds.append(conv) 23 | 24 | ds = Dataset.from_list(ds) 25 | ds.save_to_disk(f'{file_path}_converted') 26 | -------------------------------------------------------------------------------- /tools/gdata.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # 输入你的名字 4 | name = 'SmartFlowAI' 5 | # 重复次数 6 | n = 2000 7 | 8 | data = [ 9 | { 10 | "conversation": [ 11 | { 12 | "system":"你是一个懂中文的小助手", 13 | "input": "你是(请用中文回答)", 14 | "output": "您好,我是{},一个由 SmartFlowAI 打造的人工智能助手,请问有什么可以帮助您的吗?".format(name) 15 | 16 | 17 | } 18 | ] 19 | } 20 | ] 21 | 22 | for i in range(n): 23 | data.append(data[0]) 24 | 25 | with open('data/personal_assistant.json', 'w', encoding='utf-8') as f: 26 | json.dump(data, f, ensure_ascii=False, indent=4) 27 | -------------------------------------------------------------------------------- /tools/internstudio_quant_web_demo.py: -------------------------------------------------------------------------------- 1 | 2 | # isort: skip_file 3 | import copy 4 | import warnings 5 | from dataclasses import asdict, dataclass 6 | from typing import Callable, List, Optional 7 | 8 | import streamlit as st 9 | import torch 10 | from torch import nn 11 | from transformers.generation.utils import (LogitsProcessorList, 12 | StoppingCriteriaList) 13 | from transformers.utils import logging 14 | 15 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig # isort: skip 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | import argparse 21 | 22 | 23 | 24 | @dataclass 25 | class GenerationConfig: 26 | # this config is used for chat to provide more diversity 27 | max_length: int = 32768 28 | top_p: float = 0.8 29 | temperature: float = 0.8 30 | do_sample: bool = True 31 | repetition_penalty: float = 1.005 32 | 33 | 34 | @torch.inference_mode() 35 | def generate_interactive( 36 | model, 37 | tokenizer, 38 | prompt, 39 | generation_config: 
Optional[GenerationConfig] = None, 40 | logits_processor: Optional[LogitsProcessorList] = None, 41 | stopping_criteria: Optional[StoppingCriteriaList] = None, 42 | prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], 43 | List[int]]] = None, 44 | additional_eos_token_id: Optional[int] = None, 45 | **kwargs, 46 | ): 47 | inputs = tokenizer([prompt], return_tensors='pt') 48 | input_length = len(inputs['input_ids'][0]) 49 | for k, v in inputs.items(): 50 | inputs[k] = v.cuda() 51 | input_ids = inputs['input_ids'] 52 | _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] 53 | if generation_config is None: 54 | generation_config = model.generation_config 55 | generation_config = copy.deepcopy(generation_config) 56 | model_kwargs = generation_config.update(**kwargs) 57 | bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612 58 | generation_config.bos_token_id, 59 | generation_config.eos_token_id, 60 | ) 61 | if isinstance(eos_token_id, int): 62 | eos_token_id = [eos_token_id] 63 | if additional_eos_token_id is not None: 64 | eos_token_id.append(additional_eos_token_id) 65 | has_default_max_length = kwargs.get( 66 | 'max_length') is None and generation_config.max_length is not None 67 | if has_default_max_length and generation_config.max_new_tokens is None: 68 | warnings.warn( 69 | f"Using 'max_length''s default ({repr(generation_config.max_length)}) \ 70 | to control the generation length. " 71 | 'This behaviour is deprecated and will be removed from the \ 72 | config in v5 of Transformers -- we' 73 | ' recommend using `max_new_tokens` to control the maximum \ 74 | length of the generation.', 75 | UserWarning, 76 | ) 77 | elif generation_config.max_new_tokens is not None: 78 | generation_config.max_length = generation_config.max_new_tokens + \ 79 | input_ids_seq_length 80 | if not has_default_max_length: 81 | logger.warn( # pylint: disable=W4902 82 | f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) " 83 | f"and 'max_length'(={generation_config.max_length}) seem to " 84 | "have been set. 'max_new_tokens' will take precedence. " 85 | 'Please refer to the documentation for more information. ' 86 | '(https://huggingface.co/docs/transformers/main/' 87 | 'en/main_classes/text_generation)', 88 | UserWarning, 89 | ) 90 | 91 | if input_ids_seq_length >= generation_config.max_length: 92 | input_ids_string = 'input_ids' 93 | logger.warning( 94 | f"Input length of {input_ids_string} is {input_ids_seq_length}, " 95 | f"but 'max_length' is set to {generation_config.max_length}. " 96 | 'This can lead to unexpected behavior. You should consider' 97 | " increasing 'max_new_tokens'.") 98 | 99 | # 2. 
Set generation parameters if not already defined 100 | logits_processor = logits_processor if logits_processor is not None \ 101 | else LogitsProcessorList() 102 | stopping_criteria = stopping_criteria if stopping_criteria is not None \ 103 | else StoppingCriteriaList() 104 | 105 | logits_processor = model._get_logits_processor( 106 | generation_config=generation_config, 107 | input_ids_seq_length=input_ids_seq_length, 108 | encoder_input_ids=input_ids, 109 | prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, 110 | logits_processor=logits_processor, 111 | ) 112 | 113 | stopping_criteria = model._get_stopping_criteria( 114 | generation_config=generation_config, 115 | stopping_criteria=stopping_criteria) 116 | logits_warper = model._get_logits_warper(generation_config) 117 | 118 | unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) 119 | scores = None 120 | while True: 121 | model_inputs = model.prepare_inputs_for_generation( 122 | input_ids, **model_kwargs) 123 | # forward pass to get next token 124 | outputs = model( 125 | **model_inputs, 126 | return_dict=True, 127 | output_attentions=False, 128 | output_hidden_states=False, 129 | ) 130 | 131 | next_token_logits = outputs.logits[:, -1, :] 132 | 133 | # pre-process distribution 134 | next_token_scores = logits_processor(input_ids, next_token_logits) 135 | next_token_scores = logits_warper(input_ids, next_token_scores) 136 | 137 | # sample 138 | probs = nn.functional.softmax(next_token_scores, dim=-1) 139 | if generation_config.do_sample: 140 | next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) 141 | else: 142 | next_tokens = torch.argmax(probs, dim=-1) 143 | 144 | # update generated ids, model inputs, and length for next step 145 | input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) 146 | model_kwargs = model._update_model_kwargs_for_generation( 147 | outputs, model_kwargs, is_encoder_decoder=False) 148 | unfinished_sequences = unfinished_sequences.mul( 149 | (min(next_tokens != i for i in eos_token_id)).long()) 150 | 151 | output_token_ids = input_ids[0].cpu().tolist() 152 | output_token_ids = output_token_ids[input_length:] 153 | for each_eos_token_id in eos_token_id: 154 | if output_token_ids[-1] == each_eos_token_id: 155 | output_token_ids = output_token_ids[:-1] 156 | response = tokenizer.decode(output_token_ids) 157 | 158 | yield response 159 | # stop when each sentence is finished 160 | # or if we exceed the maximum length 161 | if unfinished_sequences.max() == 0 or stopping_criteria( 162 | input_ids, scores): 163 | break 164 | 165 | 166 | def on_btn_click(): 167 | del st.session_state.messages 168 | 169 | 170 | @st.cache_resource 171 | def load_model(arg1): 172 | # model = AutoModelForCausalLM.from_pretrained(args.m).cuda() 173 | # tokenizer = AutoTokenizer.from_pretrained(args.m, trust_remote_code=True) 174 | nf4_config = BitsAndBytesConfig(load_in_4bit=True, 175 | load_in_8bit=False, 176 | llm_int8_threshold=6.0, 177 | llm_int8_has_fp16_weight=False, 178 | bnb_4bit_compute_dtype=torch.float16, 179 | bnb_4bit_use_double_quant=True, 180 | bnb_4bit_quant_type='nf4') 181 | model = AutoModelForCausalLM.from_pretrained(arg1, 182 | device_map='auto', 183 | torch_dtype=torch.float16, 184 | trust_remote_code=True, 185 | quantization_config=nf4_config) 186 | tokenizer = AutoTokenizer.from_pretrained(arg1, trust_remote_code=True) 187 | 188 | 189 | return model, tokenizer 190 | 191 | 192 | def prepare_generation_config(): 193 | with st.sidebar: 194 | max_length = st.slider('Max Length', 195 | 
min_value=8, 196 | max_value=8192, 197 | value=8192) 198 | top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01) 199 | temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01) 200 | st.button('Clear Chat History', on_click=on_btn_click) 201 | 202 | generation_config = GenerationConfig(max_length=max_length, 203 | top_p=top_p, 204 | temperature=temperature) 205 | 206 | return generation_config 207 | 208 | 209 | user_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>' 210 | robot_prompt = '<|start_header_id|>assistant<|end_header_id|>\n\n{robot}<|eot_id|>' 211 | cur_query_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' 212 | 213 | 214 | def combine_history(prompt): 215 | messages = st.session_state.messages 216 | total_prompt = '' 217 | for message in messages: 218 | cur_content = message['content'] 219 | if message['role'] == 'user': 220 | cur_prompt = user_prompt.format(user=cur_content) 221 | elif message['role'] == 'robot': 222 | cur_prompt = robot_prompt.format(robot=cur_content) 223 | else: 224 | raise RuntimeError 225 | total_prompt += cur_prompt 226 | total_prompt = total_prompt + cur_query_prompt.format(user=prompt) 227 | return total_prompt 228 | 229 | 230 | def main(arg1): 231 | # torch.cuda.empty_cache() 232 | print('load model begin.') 233 | model, tokenizer = load_model(arg1) 234 | print('load model end.') 235 | 236 | st.title('Llama3-Instruct') 237 | 238 | generation_config = prepare_generation_config() 239 | 240 | # Initialize chat history 241 | if 'messages' not in st.session_state: 242 | st.session_state.messages = [] 243 | 244 | # Display chat messages from history on app rerun 245 | for message in st.session_state.messages: 246 | with st.chat_message(message['role']): 247 | st.markdown(message['content']) 248 | 249 | # Accept user input 250 | if prompt := st.chat_input('Hello!'): 251 | # Display user message in chat message container 252 | with st.chat_message('user'): 253 | st.markdown(prompt) 254 | real_prompt = combine_history(prompt) 255 | # Add user message to chat history 256 | st.session_state.messages.append({ 257 | 'role': 'user', 258 | 'content': prompt, 259 | }) 260 | 261 | with st.chat_message('robot'): 262 | message_placeholder = st.empty() 263 | for cur_response in generate_interactive( 264 | model=model, 265 | tokenizer=tokenizer, 266 | prompt=real_prompt, 267 | additional_eos_token_id=128009, # <|eot_id|> 268 | **asdict(generation_config), 269 | ): 270 | # Display robot response in chat message container 271 | message_placeholder.markdown(cur_response + '▌') 272 | message_placeholder.markdown(cur_response) 273 | # Add robot response to chat history 274 | st.session_state.messages.append({ 275 | 'role': 'robot', 276 | 'content': cur_response, # pylint: disable=undefined-loop-variable 277 | }) 278 | torch.cuda.empty_cache() 279 | 280 | 281 | if __name__ == '__main__': 282 | 283 | import sys 284 | arg1 = sys.argv[1] 285 | main(arg1) 286 | -------------------------------------------------------------------------------- /tools/internstudio_web_demo.py: -------------------------------------------------------------------------------- 1 | 2 | # isort: skip_file 3 | import copy 4 | import warnings 5 | from dataclasses import asdict, dataclass 6 | from typing import Callable, List, Optional 7 | 8 | import streamlit as st 9 | import torch 10 | from torch import nn 11 | from transformers.generation.utils import (LogitsProcessorList, 12 | StoppingCriteriaList) 13 | from 
transformers.utils import logging 14 | 15 | from transformers import AutoTokenizer, AutoModelForCausalLM # isort: skip 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | import argparse 21 | 22 | 23 | 24 | @dataclass 25 | class GenerationConfig: 26 | # this config is used for chat to provide more diversity 27 | max_length: int = 32768 28 | top_p: float = 0.8 29 | temperature: float = 0.8 30 | do_sample: bool = True 31 | repetition_penalty: float = 1.005 32 | 33 | 34 | @torch.inference_mode() 35 | def generate_interactive( 36 | model, 37 | tokenizer, 38 | prompt, 39 | generation_config: Optional[GenerationConfig] = None, 40 | logits_processor: Optional[LogitsProcessorList] = None, 41 | stopping_criteria: Optional[StoppingCriteriaList] = None, 42 | prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], 43 | List[int]]] = None, 44 | additional_eos_token_id: Optional[int] = None, 45 | **kwargs, 46 | ): 47 | inputs = tokenizer([prompt], return_tensors='pt') 48 | input_length = len(inputs['input_ids'][0]) 49 | for k, v in inputs.items(): 50 | inputs[k] = v.cuda() 51 | input_ids = inputs['input_ids'] 52 | _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] 53 | if generation_config is None: 54 | generation_config = model.generation_config 55 | generation_config = copy.deepcopy(generation_config) 56 | model_kwargs = generation_config.update(**kwargs) 57 | bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612 58 | generation_config.bos_token_id, 59 | generation_config.eos_token_id, 60 | ) 61 | if isinstance(eos_token_id, int): 62 | eos_token_id = [eos_token_id] 63 | if additional_eos_token_id is not None: 64 | eos_token_id.append(additional_eos_token_id) 65 | has_default_max_length = kwargs.get( 66 | 'max_length') is None and generation_config.max_length is not None 67 | if has_default_max_length and generation_config.max_new_tokens is None: 68 | warnings.warn( 69 | f"Using 'max_length''s default ({repr(generation_config.max_length)}) \ 70 | to control the generation length. " 71 | 'This behaviour is deprecated and will be removed from the \ 72 | config in v5 of Transformers -- we' 73 | ' recommend using `max_new_tokens` to control the maximum \ 74 | length of the generation.', 75 | UserWarning, 76 | ) 77 | elif generation_config.max_new_tokens is not None: 78 | generation_config.max_length = generation_config.max_new_tokens + \ 79 | input_ids_seq_length 80 | if not has_default_max_length: 81 | logger.warn( # pylint: disable=W4902 82 | f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) " 83 | f"and 'max_length'(={generation_config.max_length}) seem to " 84 | "have been set. 'max_new_tokens' will take precedence. " 85 | 'Please refer to the documentation for more information. ' 86 | '(https://huggingface.co/docs/transformers/main/' 87 | 'en/main_classes/text_generation)', 88 | UserWarning, 89 | ) 90 | 91 | if input_ids_seq_length >= generation_config.max_length: 92 | input_ids_string = 'input_ids' 93 | logger.warning( 94 | f"Input length of {input_ids_string} is {input_ids_seq_length}, " 95 | f"but 'max_length' is set to {generation_config.max_length}. " 96 | 'This can lead to unexpected behavior. You should consider' 97 | " increasing 'max_new_tokens'.") 98 | 99 | # 2. 
Set generation parameters if not already defined 100 | logits_processor = logits_processor if logits_processor is not None \ 101 | else LogitsProcessorList() 102 | stopping_criteria = stopping_criteria if stopping_criteria is not None \ 103 | else StoppingCriteriaList() 104 | 105 | logits_processor = model._get_logits_processor( 106 | generation_config=generation_config, 107 | input_ids_seq_length=input_ids_seq_length, 108 | encoder_input_ids=input_ids, 109 | prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, 110 | logits_processor=logits_processor, 111 | ) 112 | 113 | stopping_criteria = model._get_stopping_criteria( 114 | generation_config=generation_config, 115 | stopping_criteria=stopping_criteria) 116 | logits_warper = model._get_logits_warper(generation_config) 117 | 118 | unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) 119 | scores = None 120 | while True: 121 | model_inputs = model.prepare_inputs_for_generation( 122 | input_ids, **model_kwargs) 123 | # forward pass to get next token 124 | outputs = model( 125 | **model_inputs, 126 | return_dict=True, 127 | output_attentions=False, 128 | output_hidden_states=False, 129 | ) 130 | 131 | next_token_logits = outputs.logits[:, -1, :] 132 | 133 | # pre-process distribution 134 | next_token_scores = logits_processor(input_ids, next_token_logits) 135 | next_token_scores = logits_warper(input_ids, next_token_scores) 136 | 137 | # sample 138 | probs = nn.functional.softmax(next_token_scores, dim=-1) 139 | if generation_config.do_sample: 140 | next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) 141 | else: 142 | next_tokens = torch.argmax(probs, dim=-1) 143 | 144 | # update generated ids, model inputs, and length for next step 145 | input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) 146 | model_kwargs = model._update_model_kwargs_for_generation( 147 | outputs, model_kwargs, is_encoder_decoder=False) 148 | unfinished_sequences = unfinished_sequences.mul( 149 | (min(next_tokens != i for i in eos_token_id)).long()) 150 | 151 | output_token_ids = input_ids[0].cpu().tolist() 152 | output_token_ids = output_token_ids[input_length:] 153 | for each_eos_token_id in eos_token_id: 154 | if output_token_ids[-1] == each_eos_token_id: 155 | output_token_ids = output_token_ids[:-1] 156 | response = tokenizer.decode(output_token_ids) 157 | 158 | yield response 159 | # stop when each sentence is finished 160 | # or if we exceed the maximum length 161 | if unfinished_sequences.max() == 0 or stopping_criteria( 162 | input_ids, scores): 163 | break 164 | 165 | 166 | def on_btn_click(): 167 | del st.session_state.messages 168 | 169 | 170 | @st.cache_resource 171 | def load_model(arg1): 172 | # model = AutoModelForCausalLM.from_pretrained(args.m).cuda() 173 | # tokenizer = AutoTokenizer.from_pretrained(args.m, trust_remote_code=True) 174 | model = AutoModelForCausalLM.from_pretrained(arg1, torch_dtype=torch.float16).cuda() 175 | tokenizer = AutoTokenizer.from_pretrained(arg1, trust_remote_code=True) 176 | 177 | 178 | return model, tokenizer 179 | 180 | 181 | def prepare_generation_config(): 182 | with st.sidebar: 183 | max_length = st.slider('Max Length', 184 | min_value=8, 185 | max_value=8192, 186 | value=8192) 187 | top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01) 188 | temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01) 189 | st.button('Clear Chat History', on_click=on_btn_click) 190 | 191 | generation_config = GenerationConfig(max_length=max_length, 192 | top_p=top_p, 193 | temperature=temperature) 
194 | 195 | return generation_config 196 | 197 | 198 | user_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>' 199 | robot_prompt = '<|start_header_id|>assistant<|end_header_id|>\n\n{robot}<|eot_id|>' 200 | cur_query_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' 201 | 202 | 203 | def combine_history(prompt): 204 | messages = st.session_state.messages 205 | total_prompt = '' 206 | for message in messages: 207 | cur_content = message['content'] 208 | if message['role'] == 'user': 209 | cur_prompt = user_prompt.format(user=cur_content) 210 | elif message['role'] == 'robot': 211 | cur_prompt = robot_prompt.format(robot=cur_content) 212 | else: 213 | raise RuntimeError 214 | total_prompt += cur_prompt 215 | total_prompt = total_prompt + cur_query_prompt.format(user=prompt) 216 | return total_prompt 217 | 218 | 219 | def main(arg1): 220 | # torch.cuda.empty_cache() 221 | print('load model begin.') 222 | model, tokenizer = load_model(arg1) 223 | print('load model end.') 224 | 225 | st.title('Llama3-Instruct') 226 | 227 | generation_config = prepare_generation_config() 228 | 229 | # Initialize chat history 230 | if 'messages' not in st.session_state: 231 | st.session_state.messages = [] 232 | 233 | # Display chat messages from history on app rerun 234 | for message in st.session_state.messages: 235 | with st.chat_message(message['role']): 236 | st.markdown(message['content']) 237 | 238 | # Accept user input 239 | if prompt := st.chat_input('Hello!'): 240 | # Display user message in chat message container 241 | with st.chat_message('user'): 242 | st.markdown(prompt) 243 | real_prompt = combine_history(prompt) 244 | # Add user message to chat history 245 | st.session_state.messages.append({ 246 | 'role': 'user', 247 | 'content': prompt, 248 | }) 249 | 250 | with st.chat_message('robot'): 251 | message_placeholder = st.empty() 252 | for cur_response in generate_interactive( 253 | model=model, 254 | tokenizer=tokenizer, 255 | prompt=real_prompt, 256 | additional_eos_token_id=128009, # <|eot_id|> 257 | **asdict(generation_config), 258 | ): 259 | # Display robot response in chat message container 260 | message_placeholder.markdown(cur_response + '▌') 261 | message_placeholder.markdown(cur_response) 262 | # Add robot response to chat history 263 | st.session_state.messages.append({ 264 | 'role': 'robot', 265 | 'content': cur_response, # pylint: disable=undefined-loop-variable 266 | }) 267 | torch.cuda.empty_cache() 268 | 269 | 270 | if __name__ == '__main__': 271 | 272 | import sys 273 | arg1 = sys.argv[1] 274 | main(arg1) 275 | -------------------------------------------------------------------------------- /web_demo.py: -------------------------------------------------------------------------------- 1 | 2 | # isort: skip_file 3 | import copy 4 | import warnings 5 | from dataclasses import asdict, dataclass 6 | from typing import Callable, List, Optional 7 | 8 | import streamlit as st 9 | import torch 10 | from torch import nn 11 | from transformers.generation.utils import (LogitsProcessorList, 12 | StoppingCriteriaList) 13 | from transformers.utils import logging 14 | 15 | from transformers import AutoTokenizer, AutoModelForCausalLM # isort: skip 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | import argparse 21 | 22 | 23 | 24 | @dataclass 25 | class GenerationConfig: 26 | # this config is used for chat to provide more diversity 27 | max_length: int = 32768 28 | top_p: float = 0.8 29 | 
temperature: float = 0.8 30 | do_sample: bool = True 31 | repetition_penalty: float = 1.005 32 | 33 | 34 | @torch.inference_mode() 35 | def generate_interactive( 36 | model, 37 | tokenizer, 38 | prompt, 39 | generation_config: Optional[GenerationConfig] = None, 40 | logits_processor: Optional[LogitsProcessorList] = None, 41 | stopping_criteria: Optional[StoppingCriteriaList] = None, 42 | prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], 43 | List[int]]] = None, 44 | additional_eos_token_id: Optional[int] = None, 45 | **kwargs, 46 | ): 47 | inputs = tokenizer([prompt], return_tensors='pt') 48 | input_length = len(inputs['input_ids'][0]) 49 | for k, v in inputs.items(): 50 | inputs[k] = v.cuda() 51 | input_ids = inputs['input_ids'] 52 | _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] 53 | if generation_config is None: 54 | generation_config = model.generation_config 55 | generation_config = copy.deepcopy(generation_config) 56 | model_kwargs = generation_config.update(**kwargs) 57 | bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612 58 | generation_config.bos_token_id, 59 | generation_config.eos_token_id, 60 | ) 61 | if isinstance(eos_token_id, int): 62 | eos_token_id = [eos_token_id] 63 | if additional_eos_token_id is not None: 64 | eos_token_id.append(additional_eos_token_id) 65 | has_default_max_length = kwargs.get( 66 | 'max_length') is None and generation_config.max_length is not None 67 | if has_default_max_length and generation_config.max_new_tokens is None: 68 | warnings.warn( 69 | f"Using 'max_length''s default ({repr(generation_config.max_length)}) \ 70 | to control the generation length. " 71 | 'This behaviour is deprecated and will be removed from the \ 72 | config in v5 of Transformers -- we' 73 | ' recommend using `max_new_tokens` to control the maximum \ 74 | length of the generation.', 75 | UserWarning, 76 | ) 77 | elif generation_config.max_new_tokens is not None: 78 | generation_config.max_length = generation_config.max_new_tokens + \ 79 | input_ids_seq_length 80 | if not has_default_max_length: 81 | logger.warn( # pylint: disable=W4902 82 | f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) " 83 | f"and 'max_length'(={generation_config.max_length}) seem to " 84 | "have been set. 'max_new_tokens' will take precedence. " 85 | 'Please refer to the documentation for more information. ' 86 | '(https://huggingface.co/docs/transformers/main/' 87 | 'en/main_classes/text_generation)', 88 | UserWarning, 89 | ) 90 | 91 | if input_ids_seq_length >= generation_config.max_length: 92 | input_ids_string = 'input_ids' 93 | logger.warning( 94 | f"Input length of {input_ids_string} is {input_ids_seq_length}, " 95 | f"but 'max_length' is set to {generation_config.max_length}. " 96 | 'This can lead to unexpected behavior. You should consider' 97 | " increasing 'max_new_tokens'.") 98 | 99 | # 2. 
Set generation parameters if not already defined 100 | logits_processor = logits_processor if logits_processor is not None \ 101 | else LogitsProcessorList() 102 | stopping_criteria = stopping_criteria if stopping_criteria is not None \ 103 | else StoppingCriteriaList() 104 | 105 | logits_processor = model._get_logits_processor( 106 | generation_config=generation_config, 107 | input_ids_seq_length=input_ids_seq_length, 108 | encoder_input_ids=input_ids, 109 | prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, 110 | logits_processor=logits_processor, 111 | ) 112 | 113 | stopping_criteria = model._get_stopping_criteria( 114 | generation_config=generation_config, 115 | stopping_criteria=stopping_criteria) 116 | logits_warper = model._get_logits_warper(generation_config) 117 | 118 | unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) 119 | scores = None 120 | while True: 121 | model_inputs = model.prepare_inputs_for_generation( 122 | input_ids, **model_kwargs) 123 | # forward pass to get next token 124 | outputs = model( 125 | **model_inputs, 126 | return_dict=True, 127 | output_attentions=False, 128 | output_hidden_states=False, 129 | ) 130 | 131 | next_token_logits = outputs.logits[:, -1, :] 132 | 133 | # pre-process distribution 134 | next_token_scores = logits_processor(input_ids, next_token_logits) 135 | next_token_scores = logits_warper(input_ids, next_token_scores) 136 | 137 | # sample 138 | probs = nn.functional.softmax(next_token_scores, dim=-1) 139 | if generation_config.do_sample: 140 | next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) 141 | else: 142 | next_tokens = torch.argmax(probs, dim=-1) 143 | 144 | # update generated ids, model inputs, and length for next step 145 | input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) 146 | model_kwargs = model._update_model_kwargs_for_generation( 147 | outputs, model_kwargs, is_encoder_decoder=False) 148 | unfinished_sequences = unfinished_sequences.mul( 149 | (min(next_tokens != i for i in eos_token_id)).long()) 150 | 151 | output_token_ids = input_ids[0].cpu().tolist() 152 | output_token_ids = output_token_ids[input_length:] 153 | for each_eos_token_id in eos_token_id: 154 | if output_token_ids[-1] == each_eos_token_id: 155 | output_token_ids = output_token_ids[:-1] 156 | response = tokenizer.decode(output_token_ids) 157 | 158 | yield response 159 | # stop when each sentence is finished 160 | # or if we exceed the maximum length 161 | if unfinished_sequences.max() == 0 or stopping_criteria( 162 | input_ids, scores): 163 | break 164 | 165 | 166 | def on_btn_click(): 167 | del st.session_state.messages 168 | 169 | 170 | @st.cache_resource 171 | def load_model(arg1): 172 | # model = AutoModelForCausalLM.from_pretrained(args.m).cuda() 173 | # tokenizer = AutoTokenizer.from_pretrained(args.m, trust_remote_code=True) 174 | model = AutoModelForCausalLM.from_pretrained(arg1, torch_dtype=torch.float16).cuda() 175 | tokenizer = AutoTokenizer.from_pretrained(arg1, trust_remote_code=True) 176 | 177 | 178 | return model, tokenizer 179 | 180 | 181 | def prepare_generation_config(): 182 | with st.sidebar: 183 | max_length = st.slider('Max Length', 184 | min_value=8, 185 | max_value=8192, 186 | value=8192) 187 | top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01) 188 | temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01) 189 | st.button('Clear Chat History', on_click=on_btn_click) 190 | 191 | generation_config = GenerationConfig(max_length=max_length, 192 | top_p=top_p, 193 | temperature=temperature) 
194 | 195 | return generation_config 196 | 197 | 198 | user_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>' 199 | robot_prompt = '<|start_header_id|>assistant<|end_header_id|>\n\n{robot}<|eot_id|>' 200 | cur_query_prompt = '<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' 201 | 202 | 203 | def combine_history(prompt): 204 | messages = st.session_state.messages 205 | total_prompt = '' 206 | for message in messages: 207 | cur_content = message['content'] 208 | if message['role'] == 'user': 209 | cur_prompt = user_prompt.format(user=cur_content) 210 | elif message['role'] == 'robot': 211 | cur_prompt = robot_prompt.format(robot=cur_content) 212 | else: 213 | raise RuntimeError 214 | total_prompt += cur_prompt 215 | total_prompt = total_prompt + cur_query_prompt.format(user=prompt) 216 | return total_prompt 217 | 218 | 219 | def main(arg1): 220 | # torch.cuda.empty_cache() 221 | print('load model begin.') 222 | model, tokenizer = load_model(arg1) 223 | print('load model end.') 224 | 225 | st.title('Llama3-Instruct') 226 | 227 | generation_config = prepare_generation_config() 228 | 229 | # Initialize chat history 230 | if 'messages' not in st.session_state: 231 | st.session_state.messages = [] 232 | 233 | # Display chat messages from history on app rerun 234 | for message in st.session_state.messages: 235 | with st.chat_message(message['role']): 236 | st.markdown(message['content']) 237 | 238 | # Accept user input 239 | if prompt := st.chat_input('Hello!'): 240 | # Display user message in chat message container 241 | with st.chat_message('user'): 242 | st.markdown(prompt) 243 | real_prompt = combine_history(prompt) 244 | # Add user message to chat history 245 | st.session_state.messages.append({ 246 | 'role': 'user', 247 | 'content': prompt, 248 | }) 249 | 250 | with st.chat_message('robot'): 251 | message_placeholder = st.empty() 252 | for cur_response in generate_interactive( 253 | model=model, 254 | tokenizer=tokenizer, 255 | prompt=real_prompt, 256 | additional_eos_token_id=128009, # <|eot_id|> 257 | **asdict(generation_config), 258 | ): 259 | # Display robot response in chat message container 260 | message_placeholder.markdown(cur_response + '▌') 261 | message_placeholder.markdown(cur_response) 262 | # Add robot response to chat history 263 | st.session_state.messages.append({ 264 | 'role': 'robot', 265 | 'content': cur_response, # pylint: disable=undefined-loop-variable 266 | }) 267 | torch.cuda.empty_cache() 268 | 269 | 270 | if __name__ == '__main__': 271 | 272 | import sys 273 | arg1 = sys.argv[1] 274 | main(arg1) 275 | --------------------------------------------------------------------------------
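The three Streamlit demos above (`tools/internstudio_quant_web_demo.py`, `tools/internstudio_web_demo.py`, `web_demo.py`) share the same core logic: the checkpoint path arrives as the script's first command-line argument (`sys.argv[1]`), the chat history is flattened into a Llama 3 prompt built from `<|start_header_id|>` / `<|eot_id|>` markers, and generation stops on token id 128009 (`<|eot_id|>`) in addition to the model's default EOS; the quantized variant differs only in loading the model with a 4-bit NF4 `BitsAndBytesConfig`. The sketch below is a minimal, non-Streamlit approximation of that flow using plain `transformers` `generate` instead of the demos' hand-written streaming loop; the checkpoint path is a placeholder, not part of the tutorial.

```python
# Minimal sketch of what the demos above do, outside Streamlit.
# Assumptions: MODEL_PATH is a placeholder for a local Llama 3 Instruct
# checkpoint, and plain `generate` stands in for the custom streaming loop.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = '/path/to/Meta-Llama-3-8B-Instruct'  # placeholder

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.float16).cuda()

# Same chat markers as user_prompt / cur_query_prompt in the demos.
prompt = ('<|start_header_id|>user<|end_header_id|>\n\n'
          'Hello!<|eot_id|>'
          '<|start_header_id|>assistant<|end_header_id|>\n\n')

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
output = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    top_p=0.8,
    temperature=0.8,
    repetition_penalty=1.005,
    # 128009 is <|eot_id|>; the demos pass it as additional_eos_token_id.
    eos_token_id=[tokenizer.eos_token_id, 128009],
)
# Strip the prompt tokens and decode only the newly generated response.
print(tokenizer.decode(output[0][inputs['input_ids'].shape[-1]:],
                       skip_special_tokens=True))
```

The demos re-implement essentially the same stopping rule by hand inside `generate_interactive` only so that partial responses can be yielded to the Streamlit placeholder token by token.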