├── data └── readme.txt ├── train ├── docs │ ├── interface.png │ ├── FAQ.md │ └── README_en.md ├── requirements.txt ├── configs │ ├── lora_config_bloom.json │ ├── lora_config_llama.json │ ├── deepspeed_config.json │ ├── deepspeed_config_flops_profile.json │ ├── deepspeed_config_stage3.json │ └── deepspeed_config_stage3_flops_profile.json ├── scripts │ ├── download_vicuna_until_success.sh │ ├── run_inference.sh │ ├── run_interface2.sh │ ├── find_libstdc++.sh │ ├── run_multi_backend.sh │ ├── merge_tokenizers.py │ ├── multinode_run.sh │ ├── convert_to_conv_data.py │ ├── run_sft.sh │ ├── run_pt.sh │ ├── run_multi_backend.py │ ├── debug.sh │ └── convert_llama_weights_to_hf.py ├── debug │ ├── templates │ │ ├── error.html │ │ ├── index.html │ │ └── output.html │ └── app.py ├── docker │ ├── upload_image.sh │ ├── build_dockerfile_upon_transfermers.sh │ ├── README.md │ ├── belle.dockerfile │ ├── docker_run.sh │ └── transformers.dockerfile ├── dschat_train_v1 │ ├── training_scripts │ │ ├── single_node │ │ │ ├── run_FT.sh │ │ │ └── run_LoRA.sh │ │ ├── single_gpu │ │ │ ├── run_FT.sh │ │ │ └── run_LoRA.sh │ │ ├── convert_to_conv_data.py │ │ └── convert_llama_weights_to_hf.py │ ├── utils │ │ ├── ds_utils.py │ │ ├── model │ │ │ └── model_utils.py │ │ ├── utils.py │ │ ├── module │ │ │ └── lora.py │ │ └── data │ │ │ └── raw_datasets.py │ ├── docker │ │ └── Dockerfile │ ├── FAQ.md │ ├── reproduce_our_papers │ │ └── Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md │ ├── prompt_eval.py │ └── README.md ├── src │ ├── merge_llama_with_lora.py │ ├── entry_point │ │ ├── inference.py │ │ ├── interface.py │ │ └── evaluation.ipynb │ ├── utils.py │ ├── sample_generator.py │ ├── callbacks.py │ ├── apply_delta.py │ └── trainer.py └── reproduce_our_papers │ └── Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md ├── requirements.txt └── README.md /data/readme.txt: -------------------------------------------------------------------------------- 1 | Training data for CALM-7B -------------------------------------------------------------------------------- /train/docs/interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dai-shen/CALM-train/HEAD/train/docs/interface.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | rouge_score 3 | fire 4 | openai 5 | transformers 6 | gensim 7 | peft 8 | datasets 9 | bitsandbytes 10 | deepspeed 11 | -------------------------------------------------------------------------------- /train/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | bitsandbytes 3 | datasets 4 | fire 5 | huggingface_hub 6 | torch 7 | tqdm 8 | transformers 9 | deepspeed 10 | gradio 11 | xformers -------------------------------------------------------------------------------- /train/configs/lora_config_bloom.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "query_key_value" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /train/scripts/download_vicuna_until_success.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while true; do 4 | python src/apply_delta.py --base 'decapoda-research/llama-7b-hf' --target './weights/vicuna-7b' --delta 'lmsys/vicuna-7b-delta-v1.1' && break 5 | sleep 1 6 | done 7 | -------------------------------------------------------------------------------- /train/configs/lora_config_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "q_proj", 7 | "k_proj", 8 | "v_proj", 9 | "o_proj", 10 | "down_proj", 11 | "gate_proj", 12 | "up_proj" 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /train/scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | model_name_or_path=/media/data1/fengduanyu/llama-2-7b-chat-T/ 2 | ckpt_path=/home/daiyf/daiyf/PIXIU-train/CRA__model_2/model_14956 3 | 4 | 5 | CUDA_VISIBLE_DEVICES=2 python src/entry_point/inference.py \ 6 | --model_name_or_path $model_name_or_path \ 7 | --ckpt_path $ckpt_path \ 8 | --llama 9 | 10 | -------------------------------------------------------------------------------- /train/scripts/run_interface2.sh: -------------------------------------------------------------------------------- 1 | model_name_or_path=/media/data1/fengduanyu/llama-2-7b-chat-T/ 2 | ckpt_path=/home/daiyf/daiyf/PIXIU-train/CRA__model_2/model_14956 3 | 4 | 5 | CUDA_VISIBLE_DEVICES=1 python src/entry_point/interface.py \ 6 | --model_name_or_path $model_name_or_path \ 7 | --ckpt_path $ckpt_path \ 8 | --llama 9 | 10 | -------------------------------------------------------------------------------- /train/scripts/find_libstdc++.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | files=$(find / -name 'libstdc++.so.6*' 2>/dev/null) 3 | total=$(echo "$files" | wc -l) 4 | counter=1 5 | 6 | echo "$total files found." 7 | 8 | for file in $files; do 9 | echo "Processing file $counter/$total: $file" 10 | if strings "$file" 2>/dev/null | grep -q 'CXXABI_1.3.9'; then 11 | echo "File matches: $file" 12 | fi 13 | ((counter++)) 14 | done 15 | -------------------------------------------------------------------------------- /train/debug/templates/error.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Error 5 | 16 | 17 | 18 |

(error page body: an "Error" heading followed by the rendered {{ message }} text; the surrounding HTML markup was not preserved in this listing)
20 | 21 | -------------------------------------------------------------------------------- /train/docker/upload_image.sh: -------------------------------------------------------------------------------- 1 | export https_proxy=http://127.0.0.1:65530 2 | export http_proxy=http://127.0.0.1:65530 3 | export all_proxy=socks5://127.0.0.1:65530 4 | docker_user=tothemoon 5 | 6 | # docker tag transformers:ds $docker_user/transformers:ds_$(date +%Y%m%d) 7 | # docker push $docker_user/transformers:ds_$(date +%Y%m%d) 8 | docker tag belle:$(date +%Y%m%d) $docker_user/belle:$(date +%Y%m%d) 9 | docker push $docker_user/belle:$(date +%Y%m%d) -------------------------------------------------------------------------------- /train/scripts/run_multi_backend.sh: -------------------------------------------------------------------------------- 1 | model_name_or_path='...' 2 | ckpt_path='...' 3 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | export MKL_SERVICE_FORCE_INTEL='1' 5 | 6 | # python scripts/run_multi_backend.py \ 7 | # --command "python ../src/entry_point/interface.py --model_name_or_path $model_name_or_path --ckpt_path $ckpt_path" 8 | 9 | python scripts/run_multi_backend.py \ 10 | --command "python ../src/entry_point/interface.py --model_name_or_path $model_name_or_path --ckpt_path $ckpt_path --use_lora" 11 | -------------------------------------------------------------------------------- /train/docker/build_dockerfile_upon_transfermers.sh: -------------------------------------------------------------------------------- 1 | export https_proxy=http://127.0.0.1:65530 2 | export http_proxy=http://127.0.0.1:65530 3 | export all_proxy=socks5://127.0.0.1:65530 4 | 5 | wget https://raw.githubusercontent.com/huggingface/transformers/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile -O transformers.dockerfile 6 | docker build --network host --build-arg HTTP_PROXY=$http_proxy -t transformers:ds -f transformers.dockerfile . 7 | docker build --network host --build-arg HTTP_PROXY=$http_proxy -t belle:$(date +%Y%m%d) -f belle.dockerfile . 8 | -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/single_node/run_FT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | 9 | rm -rf output/ 10 | mkdir -p $OUTPUT 11 | echo $OUTPUT 12 | echo $ZERO_STAGE 13 | data_output_path=$OUTPUT/data_files 14 | 15 | #BelleGroup/BELLE-7B-2M 16 | #/workspace/model_name_or_path/hf_llama_7b 17 | 18 | deepspeed main.py \ 19 | --sft_only_data_path belleMath.json \ 20 | --model_name_or_path /nfs/v100-022/pretrained_ckpt/hf_llama_7b/ \ 21 | --per_device_train_batch_size 2 \ 22 | --per_device_eval_batch_size 1 \ 23 | --max_seq_len 1024 \ 24 | --learning_rate 5e-6 \ 25 | --weight_decay 0.0001 \ 26 | --num_train_epochs 2 \ 27 | --gradient_accumulation_steps 4 \ 28 | --lr_scheduler_type cosine \ 29 | --num_warmup_steps 100 \ 30 | --seed 1234 \ 31 | --gradient_checkpointing \ 32 | --zero_stage $ZERO_STAGE \ 33 | --deepspeed \ 34 | --output_dir $OUTPUT \ 35 | --data_output_path $data_output_path \ 36 | # &> $OUTPUT/training.log 37 | -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/single_gpu/run_FT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | OUTPUT=$1 7 | ZERO_STAGE=$2 8 | 9 | rm -rf output/ 10 | rm -rf $OUTPUT 11 | mkdir -p $OUTPUT 12 | echo $OUTPUT 13 | echo $ZERO_STAGE 14 | data_output_path=$OUTPUT/data_files 15 | #bigscience/bloomz-1b7 16 | #facebook/opt-1.3b 17 | #bigscience/bloomz-560m 18 | 19 | deepspeed --num_gpus 1 main.py \ 20 | --sft_only_data_path belleMath.json \ 21 | --model_name_or_path bigscience/bloomz-560m \ 22 | --per_device_train_batch_size 8 \ 23 | --per_device_eval_batch_size 2 \ 24 | --max_seq_len 1024 \ 25 | --learning_rate 5e-5 \ 26 | --weight_decay 0.0001 \ 27 | --num_train_epochs 3 \ 28 | --gradient_accumulation_steps 8 \ 29 | --lr_scheduler_type cosine \ 30 | --num_warmup_steps 100 \ 31 | --seed 1234 \ 32 | --gradient_checkpointing \ 33 | --zero_stage $ZERO_STAGE \ 34 | --deepspeed \ 35 | --output_dir $OUTPUT \ 36 | --data_output_path $data_output_path \ 37 | # &> $OUTPUT/training.log 38 | -------------------------------------------------------------------------------- /train/scripts/merge_tokenizers.py: -------------------------------------------------------------------------------- 1 | ''' Train tokenizer''' 2 | import sentencepiece as spm 3 | 4 | spm.SentencePieceTrainer.train(input='/path/to/input_text', 5 | model_prefix='belle', 6 | model_type='bpe', 7 | vocab_size=25000, 8 | character_coverage=0.9995) 9 | 10 | ''' Merge tokenizer ''' 11 | import sentencepiece_model_pb2 as model 12 | orig_model_path = '/path/to/llama/tokenizer.model' 13 | belle_model_path = '/path/to/belle/belle.model' 14 | orig_m = model.ModelProto() 15 | belle_m = model.ModelProto() 16 | orig_m.ParseFromString(open(orig_model_path, "rb").read()) 17 | belle_m.ParseFromString(open(belle_model_path, "rb").read()) 18 | print(len(orig_m.pieces), len(belle_m.pieces)) 19 | orig_pieces = [] 20 | for piece in orig_m.pieces: 21 | orig_pieces.append(piece.piece) 22 | for piece in belle_m.pieces: 23 | if piece.piece not in orig_pieces: 24 | orig_m.pieces.append(piece) 25 | orig_pieces.append(piece.piece) 26 | 27 | print(len(orig_m.pieces)) 28 | save_vocab_path = '/path/to/merge_tokenizer/tokenizer.model' 29 | with open(save_vocab_path, 'wb') as f: 30 | f.write(orig_m.SerializeToString()) 
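''' Check merged tokenizer (optional, illustrative) '''
# A minimal sanity check -- a sketch only, not part of the original merge flow.
# The directory path below is an assumption: point it at wherever the merged
# tokenizer.model from the step above was saved. LlamaTokenizer reads the
# sentencepiece model directly from that directory.
from transformers import LlamaTokenizer

merged_tokenizer = LlamaTokenizer.from_pretrained('/path/to/merge_tokenizer')
print(len(merged_tokenizer))  # should be close to the merged piece count printed above
print(merged_tokenizer.tokenize("小明有5个球"))  # Chinese text should now map to fewer, larger pieces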
-------------------------------------------------------------------------------- /train/scripts/multinode_run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | node_rank=$1 4 | echo ${node_rank} 5 | 6 | model_name_or_path=/path_to_llm/hf_llama_7b/ # or bloomz-7b1-mt 7 | 8 | train_file=belleMath.json 9 | validation_file=belleMath-dev1K.json 10 | output_dir=saved_models 11 | mkdir -p ${output_dir} 12 | 13 | cache_dir=hf_cache_dir 14 | mkdir -p ${cache_dir} 15 | cutoff_len=1024 16 | 17 | master_addr="10.111.112.223" 18 | 19 | # #Multi-node 20 | torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} src/entry_point/sft_train.py \ 21 | --model_name_or_path ${model_name_or_path} \ 22 | --llama \ 23 | --deepspeed configs/deepspeed_config.json \ 24 | --train_file ${train_file} \ 25 | --validation_file ${validation_file} \ 26 | --per_device_train_batch_size 2 \ 27 | --per_device_eval_batch_size 1 \ 28 | --gradient_accumulation_steps 1 \ 29 | --num_train_epochs 2 \ 30 | --model_max_length ${cutoff_len} \ 31 | --save_strategy "steps" \ 32 | --save_total_limit 3 \ 33 | --learning_rate 8e-6 \ 34 | --weight_decay 0.00001 \ 35 | --warmup_ratio 0.05 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 10 \ 38 | --evaluation_strategy "steps" \ 39 | --fp16 \ 40 | --seed 1234 \ 41 | --gradient_checkpointing \ 42 | --cache_dir ${cache_dir} \ 43 | --output_dir ${output_dir} 44 | -------------------------------------------------------------------------------- /train/configs/deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 2e8, 6 | "reduce_scatter": true, 7 | "reduce_bucket_size": "auto", 8 | "overlap_comm": true, 9 | "contiguous_gradients": true 10 | }, 11 | "scheduler": { 12 | "type": "WarmupDecayLR", 13 | "params": { 14 | "warmup_min_lr": "auto", 15 | "warmup_max_lr": "auto", 16 | "warmup_num_steps": "auto", 17 | "total_num_steps": "auto" 18 | } 19 | }, 20 | "fp16": { 21 | "enabled": true, 22 | "auto_cast": false, 23 | "loss_scale": 0, 24 | "loss_scale_window": 1000, 25 | "initial_scale_power": 32, 26 | "hysteresis": 2, 27 | "min_loss_scale": 1 28 | }, 29 | "bf16": { 30 | "enabled": false 31 | }, 32 | "optimizer": { 33 | "type": "AdamW", 34 | "params": { 35 | "lr": "auto", 36 | "weight_decay": "auto", 37 | "betas": "auto", 38 | "eps": "auto", 39 | "torch_adam": true, 40 | "adam_w_mode": true 41 | } 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "steps_per_print": 1000, 46 | "train_batch_size": "auto", 47 | "train_micro_batch_size_per_gpu": "auto", 48 | "wall_clock_breakdown": false 49 | } 50 | -------------------------------------------------------------------------------- /train/docker/README.md: -------------------------------------------------------------------------------- 1 | # docker镜像 2 | ## 国内环境自行替换代理 3 | ```bash 4 | export https_proxy=http://127.0.0.1:65530 5 | export http_proxy=http://127.0.0.1:65530 6 | export all_proxy=socks5://127.0.0.1:65530 7 | ``` 8 | 9 | ## 构建镜像 10 | ```bash 11 | sudo bash build_dockerfile_upon_transfermers.sh 12 | ``` 13 | 14 | ## 上传镜像到dockerhub 15 | ```bash 16 | sudo bash upload_image.sh 17 | ``` 18 | 19 | ## 下载镜像 20 | 已经构建好镜像,无需自行构建 21 | ```bash 22 | sudo docker tothemoon/belle:20230728 23 | ``` 24 | belle镜像中包含sshd,可以远程连接到容器内部 25 | 26 | ## 运行镜像 
27 | ### 1. 参考[nvidia安装说明](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)安装container-toolkit 28 | ### 2. 自行创建ssh密钥 29 | ### 3. 运行容器 30 | ```bash 31 | sudo bash docker_run.sh 32 | ``` 33 | `docker_run.sh`文件主要内容如下 34 | ``` 35 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 36 | --network host \ 37 | [--env https_proxy=$https_proxy \] 38 | [--env http_proxy=$http_proxy \] 39 | [--env all_proxy=$all_proxy \] 40 | --env HF_HOME=$hf_home \ 41 | -it [--rm] \ 42 | --name belle \ 43 | -v $belle_path:$belle_path \ 44 | -v $hf_home:$hf_home \ 45 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 46 | -w $workdir \ 47 | $docker_user/belle:$tag \ 48 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"] 49 | ``` 50 | `[]`中内容可忽略 51 | - `--rm`:容器退出时销毁,如果长期在容器中工作,可忽略 52 | - `--sshd_port`:sshd监听端口,默认是22001 53 | - `--cmd`:容器要执行的命令`"echo 'Hello, world!' && /bin/bash"`,可忽略 54 | - `hf_home`:huggingface缓存目录 55 | - `$ssh_pub_key`:sshd公钥目录 -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/single_node/run_LoRA.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | #facebook/opt-1.3b 7 | # Note that usually LoRA needs to use larger learning rate 8 | OUTPUT_PATH=$1 9 | ZERO_STAGE=$2 10 | echo $OUTPUT_PATH 11 | echo $ZERO_STAGE 12 | rm -rf output/ 13 | mkdir -p $OUTPUT_PATH 14 | 15 | 16 | # model_name_or_path=/workspace/model_name_or_path/hf_llama_7b 17 | # lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj" 18 | # If the model is Bloom, lora_module_name should be 19 | model_name_or_path=/workspace/model_name_or_path/bloomz-7b1-mt 20 | lora_module_name="query_key_value,mlp" 21 | 22 | echo ${lora_module_name} 23 | 24 | deepspeed main.py \ 25 | --sft_only_data_path belleMath.json \ 26 | --eval_data_file belleMath-dev1K.json \ 27 | --data_split 10,0,0 \ 28 | --model_name_or_path ${model_name_or_path} \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 1 \ 31 | --max_seq_len 512 \ 32 | --learning_rate 3e-4 \ 33 | --weight_decay 0. 
\ 34 | --num_train_epochs 5 \ 35 | --gradient_accumulation_steps 1 \ 36 | --lr_scheduler_type cosine \ 37 | --num_warmup_steps 100 \ 38 | --seed 1234 \ 39 | --gradient_checkpointing \ 40 | --zero_stage $ZERO_STAGE \ 41 | --lora_dim 16 \ 42 | --lora_alpha 16 \ 43 | --lora_droppout 0.05 \ 44 | --lora_module_name ${lora_module_name} \ 45 | --deepspeed \ 46 | --output_dir $OUTPUT_PATH \ 47 | # &> $OUTPUT_PATH/training.log 48 | -------------------------------------------------------------------------------- /train/src/merge_llama_with_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from peft import PeftModel 5 | from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer 6 | 7 | # Ref: https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py 8 | 9 | def apply_lora(model_name_or_path, output_path, lora_path): 10 | print(f"Loading the base model from {model_name_or_path}") 11 | base = AutoModelForCausalLM.from_pretrained( 12 | model_name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 13 | ) 14 | if args.llama: 15 | base_tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path) 16 | else: 17 | base_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 18 | 19 | print(f"Loading the LoRA adapter from {lora_path}") 20 | 21 | lora_model = PeftModel.from_pretrained( 22 | base, 23 | lora_path, 24 | torch_dtype=torch.float16, 25 | ) 26 | 27 | print("Applying the LoRA") 28 | model = lora_model.merge_and_unload() 29 | 30 | print(f"Saving the target model to {output_path}") 31 | model.save_pretrained(output_path) 32 | base_tokenizer.save_pretrained(output_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--model_name_or_path", type=str, required=True) 38 | parser.add_argument("--output_path", type=str, required=True) 39 | parser.add_argument("--lora_path", type=str, required=True) 40 | parser.add_argument("--llama", action="store_true", required=True) 41 | 42 | args = parser.parse_args() 43 | 44 | apply_lora(args.model_name_or_path, args.output_path, args.lora_path) 45 | -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/single_gpu/run_LoRA.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | #facebook/opt-1.3b 7 | # Note that usually LoRA needs to use larger learning rate 8 | #/nfs/v100-022/jiyunjie/anaconda3/envs/llamalora/ 9 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/v100-022/jiyunjie/anaconda3/envs/llamalora/lib/ 10 | 11 | OUTPUT_PATH=$1 12 | ZERO_STAGE=$2 13 | 14 | rm -rf output/ 15 | mkdir -p $OUTPUT_PATH 16 | echo $OUTPUT_PATH 17 | echo $ZERO_STAGE 18 | 19 | model_name_or_path=/workspace/model_name_or_path/hf_llama_7b 20 | lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj" 21 | # If the model is Bloom, lora_module_name should be 22 | # model_name_or_path=/workspace/model_name_or_path/bloomz-7b1-mt 23 | # lora_module_name="query_key_value,mlp" 24 | 25 | echo ${lora_module_name} 26 | echo ${model_name_or_path} 27 | 28 | deepspeed --num_gpus 1 main.py \ 29 | --sft_only_data_path belleMath.json \ 30 | --data_split 10,0,0 \ 31 | --model_name_or_path ${model_name_or_path} \ 32 | --per_device_train_batch_size 2 \ 33 | --per_device_eval_batch_size 1 \ 34 | --max_seq_len 1024 \ 35 | --learning_rate 3e-4 \ 36 | --weight_decay 0. \ 37 | --num_train_epochs 5 \ 38 | --gradient_accumulation_steps 1 \ 39 | --lr_scheduler_type cosine \ 40 | --num_warmup_steps 100 \ 41 | --seed 1234 \ 42 | --gradient_checkpointing \ 43 | --zero_stage $ZERO_STAGE \ 44 | --lora_dim 16 \ 45 | --lora_alpha 16 \ 46 | --lora_droppout 0.05 \ 47 | --lora_module_name ${lora_module_name} \ 48 | --deepspeed \ 49 | --output_dir $OUTPUT_PATH \ 50 | # &> $OUTPUT_PATH/training.log 51 | -------------------------------------------------------------------------------- /train/scripts/convert_to_conv_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import datetime 4 | import os 5 | 6 | ''' 7 | orig_data: {"instruction": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?", "input": "", "output": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"} 8 | convert: { 9 | "id": xxx, 10 | "conversations":[ 11 | {"from": "human", "value": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?"}, 12 | {"from": "assistant", "value": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}, 13 | ] 14 | } 15 | ''' 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "--orig_data", 21 | ) 22 | parser.add_argument( 23 | "--write_data", 24 | ) 25 | parser.add_argument( 26 | "--dataset_name", 27 | ) 28 | args = parser.parse_args() 29 | f_write = open(args.write_data,"w") 30 | with open(args.orig_data) as f: 31 | lines = f.readlines() 32 | num_id = 1 33 | for line in lines: 34 | data = json.loads(line) 35 | conversations = [{"from": "human", "value": data['instruction']+data['input']},{"from": "assistant", "value": data['output']}] 36 | # conversations = [{"from": "human", "value": data['input']},{"from": "assistant", "value": data['target']}] 37 | uniq_id = data['id'] if "id" in data else args.dataset_name+"-"+str(num_id) 38 | item = {"id":uniq_id, "conversations": conversations} 39 | f_write.write(json.dumps(item, ensure_ascii=False)+"\n") 40 | num_id += 1 41 | f_write.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /train/configs/deepspeed_config_flops_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"flops_profiler": { 3 | "enabled": true, 4 | "profile_step": 50, 5 | "module_depth": -1, 6 | "top_modules": 1, 7 | "detailed": true, 8 | "output_file": "profile.log" 9 | }, 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 2e8, 14 | "reduce_scatter": true, 15 | "reduce_bucket_size": "auto", 16 | "overlap_comm": true, 17 | "contiguous_gradients": true 18 | }, 19 | "scheduler": { 20 | "type": "WarmupDecayLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto", 25 | "total_num_steps": "auto" 26 | } 27 | }, 28 | "fp16": { 29 | "enabled": false, 30 | "auto_cast": false, 31 | "loss_scale": 0, 32 | "loss_scale_window": 1000, 33 | "initial_scale_power": 32, 34 | "hysteresis": 2, 35 | "min_loss_scale": 1 36 | }, 37 | "bf16": { 38 | "enabled": true 39 | }, 40 | "optimizer": { 41 | "type": "AdamW", 42 | "params": { 43 | "lr": "auto", 44 | "weight_decay": "auto", 45 | "betas": "auto", 46 | "eps": "auto", 47 | "torch_adam": true, 48 | "adam_w_mode": true 49 | } 50 | }, 51 | "gradient_accumulation_steps": "auto", 52 | "gradient_clipping": "auto", 53 | "steps_per_print": 1000, 54 | "train_batch_size": "auto", 55 | "train_micro_batch_size_per_gpu": "auto", 56 | "wall_clock_breakdown": false 57 | } -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/convert_to_conv_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import datetime 4 | import os 5 | 6 | ''' 7 | orig_data: {"instruction": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?", "input": "", "output": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"} 8 | convert: { 9 | "id": xxx, 10 | "conversations":[ 11 | {"from": "human", "value": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?"}, 12 | {"from": "assistant", "value": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}, 13 | ] 14 | } 15 | ''' 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "--orig_data", 21 | ) 22 | parser.add_argument( 23 | "--write_data", 24 | ) 25 | parser.add_argument( 26 | "--dataset_name", 27 | ) 28 | args = parser.parse_args() 29 | f_write = open(args.write_data,"w") 30 | with open(args.orig_data) as f: 31 | lines = f.readlines() 32 | num_id = 1 33 | for line in lines: 34 | data = json.loads(line) 35 | conversations = [{"from": "human", "value": data['instruction']+data['input']},{"from": "assistant", "value": data['output']}] 36 | # conversations = [{"from": "human", "value": data['input']},{"from": "assistant", "value": data['target']}] 37 | uniq_id = data['id'] if "id" in data else args.dataset_name+"-"+str(num_id) 38 | item = {"id":uniq_id, "conversations": conversations} 39 | f_write.write(json.dumps(item, ensure_ascii=False)+"\n") 40 | num_id += 1 41 | f_write.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /train/configs/deepspeed_config_stage3.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 0, 7 | "reduce_bucket_size": "auto", 8 | "stage3_prefetch_bucket_size": "auto", 9 | "stage3_param_persistence_threshold": "auto", 10 | 
"stage3_max_live_parameters": 0, 11 | "stage3_max_reuse_distance": 0, 12 | "stage3_gather_16bit_weights_on_model_save": true, 13 | "offload_param": { 14 | "device": "cpu" 15 | }, 16 | "offload_optimizer": { 17 | "device": "cpu" 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "WarmupDecayLR", 22 | "params": { 23 | "warmup_min_lr": "auto", 24 | "warmup_max_lr": "auto", 25 | "warmup_num_steps": "auto", 26 | "total_num_steps": "auto" 27 | } 28 | }, 29 | "fp16": { 30 | "enabled": false, 31 | "auto_cast": false, 32 | "loss_scale": 0, 33 | "loss_scale_window": 1000, 34 | "initial_scale_power": 32, 35 | "hysteresis": 2, 36 | "min_loss_scale": 1 37 | }, 38 | "bf16": { 39 | "enabled": true 40 | }, 41 | "optimizer": { 42 | "type": "AdamW", 43 | "params": { 44 | "lr": "auto", 45 | "weight_decay": "auto", 46 | "betas": "auto", 47 | "eps": "auto", 48 | "torch_adam": true, 49 | "adam_w_mode": true 50 | } 51 | }, 52 | "gradient_accumulation_steps": "auto", 53 | "gradient_clipping": "auto", 54 | "steps_per_print": 1000, 55 | "train_batch_size": "auto", 56 | "train_micro_batch_size_per_gpu": "auto", 57 | "wall_clock_breakdown": false 58 | } 59 | -------------------------------------------------------------------------------- /train/configs/deepspeed_config_stage3_flops_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "flops_profiler": { 3 | "enabled": true, 4 | "profile_step": 50, 5 | "module_depth": -1, 6 | "top_modules": 1, 7 | "detailed": true, 8 | "output_file": "profile.log" 9 | }, 10 | "zero_optimization": { 11 | "stage": 3, 12 | "overlap_comm": true, 13 | "contiguous_gradients": true, 14 | "sub_group_size": 0, 15 | "reduce_bucket_size": "auto", 16 | "stage3_prefetch_bucket_size": "auto", 17 | "stage3_param_persistence_threshold": "auto", 18 | "stage3_max_live_parameters": 0, 19 | "stage3_max_reuse_distance": 0, 20 | "stage3_gather_16bit_weights_on_model_save": true, 21 | "offload_param": { 22 | "device": "cpu" 23 | }, 24 | "offload_optimizer": { 25 | "device": "cpu" 26 | } 27 | }, 28 | "scheduler": { 29 | "type": "WarmupDecayLR", 30 | "params": { 31 | "warmup_min_lr": "auto", 32 | "warmup_max_lr": "auto", 33 | "warmup_num_steps": "auto", 34 | "total_num_steps": "auto" 35 | } 36 | }, 37 | "fp16": { 38 | "enabled": false, 39 | "auto_cast": false, 40 | "loss_scale": 0, 41 | "loss_scale_window": 1000, 42 | "initial_scale_power": 32, 43 | "hysteresis": 2, 44 | "min_loss_scale": 1 45 | }, 46 | "bf16": { 47 | "enabled": true 48 | }, 49 | "optimizer": { 50 | "type": "AdamW", 51 | "params": { 52 | "lr": "auto", 53 | "weight_decay": "auto", 54 | "betas": "auto", 55 | "eps": "auto", 56 | "torch_adam": true, 57 | "adam_w_mode": true 58 | } 59 | }, 60 | "gradient_accumulation_steps": "auto", 61 | "gradient_clipping": "auto", 62 | "steps_per_print": 1000, 63 | "train_batch_size": "auto", 64 | "train_micro_batch_size_per_gpu": "auto", 65 | "wall_clock_breakdown": false 66 | } -------------------------------------------------------------------------------- /train/docker/belle.dockerfile: -------------------------------------------------------------------------------- 1 | FROM transformers:ds 2 | LABEL maintainer="BELLE" 3 | WORKDIR /workspace 4 | 5 | RUN apt update 6 | RUN apt install -y htop 7 | RUN apt install -y screen 8 | RUN apt install -y openssh-server \ 9 | && sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config \ 10 | && sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config \ 11 | && sed 
-i "s/#PasswordAuthentication yes/PasswordAuthentication no/" /etc/ssh/sshd_config \ 12 | && mkdir -p /run/sshd 13 | 14 | RUN python3 -m pip install -U --no-cache-dir pip 15 | RUN python3 -m pip install -U --no-cache-dir peft 16 | RUN python3 -m pip install -U --no-cache-dir gradio 17 | RUN python3 -m pip install -U --no-cache-dir pudb 18 | RUN python3 -m pip install -U --no-cache-dir xformers 19 | RUN python3 -m pip install -U --no-cache-dir bitsandbytes 20 | RUN python3 -m pip install -U --no-build-isolation --no-cache-dir flash-attn 21 | 22 | RUN mkdir -p /scripts && echo -e '#!/bin/bash\n\ 23 | SSHD_PORT=22001\n\ 24 | CMD_TO_RUN=""\n\ 25 | while (( "$#" )); do\n\ 26 | case "$1" in\n\ 27 | --sshd_port)\n\ 28 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then\n\ 29 | SSHD_PORT=$2\n\ 30 | shift 2\n\ 31 | else\n\ 32 | echo "Error: Argument for $1 is missing" >&2\n\ 33 | exit 1\n\ 34 | fi\n\ 35 | ;;\n\ 36 | --cmd)\n\ 37 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then\n\ 38 | CMD_TO_RUN=$2\n\ 39 | shift 2\n\ 40 | else\n\ 41 | echo "Error: Argument for $1 is missing" >&2\n\ 42 | exit 1\n\ 43 | fi\n\ 44 | ;;\n\ 45 | -*|--*=) \n\ 46 | echo "Error: Unsupported flag $1" >&2\n\ 47 | exit 1\n\ 48 | ;;\n\ 49 | *) \n\ 50 | shift\n\ 51 | ;;\n\ 52 | esac\n\ 53 | done\n\ 54 | sed -i "s/#Port 22/Port $SSHD_PORT/" /etc/ssh/sshd_config\n\ 55 | /usr/sbin/sshd\n\ 56 | if [ -n "$CMD_TO_RUN" ]; then\n\ 57 | bash -c "$CMD_TO_RUN"\n\ 58 | else\n\ 59 | /bin/bash\n\ 60 | fi' > /scripts/startup.sh && chmod +x /scripts/startup.sh 61 | 62 | ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"] 63 | -------------------------------------------------------------------------------- /train/scripts/run_sft.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # llama1-7b --> abs_path: 3 | # /home/daiyf/daiyf/HFmodel/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16 4 | # llama2-7b-chat --> abs_path: 5 | # /media/data1/fengduanyu/llama-2-7b-chat-T/ 6 | export CUDA_VISIBLE_DEVICES='0,1,2,3' # Full 7 | export WANDB_PROJECT=CRA-llama2-7b-chat 8 | export WANDB_RUN_ID=CRA_0.045M 9 | export WANDB_RESUME=allow 10 | export ABS_PATH="" 11 | export PYTHONPATH="path_to/train" 12 | export WANDB_SERVER_PORT=10086 13 | export WANDB_API_KEY=WANDB_API_KEY 14 | model_name_or_path="path_to/llama-2-7b-chat-T" 15 | 16 | train_file="path_to/CRA-resample-train4w.json" 17 | validation_file="path_to/CRA-resample-dev3k.json" 18 | output_dir="$ABS_PATH/saved_models2/${WANDB_PROJECT}_${WANDB_RUN_ID}" 19 | mkdir -p ${output_dir} 20 | 21 | cache_dir="path_to/hf_cache_dir_2" # 代表缓存数据处理过程的路径 22 | log_dir="/path_to/train_log_dir_2" 23 | mkdir -p ${cache_dir} 24 | mkdir -p ${log_dir} 25 | cutoff_len=2048 # 最长输入序列长度(LLaMA模型建议设置为1024以上,Bloom模型设置为512以上) 26 | echo ${log_dir} 27 | 28 | # LoRA without 8bit 29 | nohup torchrun --nproc_per_node 2 src/entry_point/sft_train.py \ 30 | --model_name_or_path ${model_name_or_path} \ 31 | --bf16 True \ 32 | --llama True \ 33 | --use_lora True \ 34 | --deepspeed configs/deepspeed_config_stage3.json \ 35 | --lora_config configs/lora_config_llama.json \ 36 | --train_file ${train_file} \ 37 | --validation_file ${validation_file} \ 38 | --per_device_train_batch_size 6 \ 39 | --per_device_eval_batch_size 6 \ 40 | --gradient_accumulation_steps 1 \ 41 | --num_train_epochs 5 \ 42 | --model_max_length ${cutoff_len} \ 43 | --save_strategy "steps" \ 44 | --save_total_limit 3 \ 45 | --learning_rate 3e-4 \ 46 | --weight_decay 0.00001 \ 47 | --warmup_ratio 0.01 \ 48 | --lr_scheduler_type "cosine" \ 49 | --logging_steps 10 \ 50 | --evaluation_strategy "steps" \ 51 | --seed 1234 \ 52 | --gradient_checkpointing \ 53 | --cache_dir ${cache_dir} \ 54 | --output_dir ${output_dir} \ 55 | --overwrite_output_dir \ 56 | > ${log_dir}/train.log 2>&1 & 57 | # --fp16 \ 58 | # --resume_from_checkpoint ... -------------------------------------------------------------------------------- /train/debug/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Text Generation 5 | 6 | 7 |
(page body: a "Generate Form" section and an "Inference Form" section with their input fields; the form's HTML markup was not preserved in this listing)
40 | 41 | 42 | -------------------------------------------------------------------------------- /train/dschat_train_v1/utils/ds_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | GLOBAL_BATCH_SIZE = 32 6 | MICRO_BATCH_SIZE = 4 7 | 8 | 9 | def get_train_ds_config(offload, 10 | stage=2, 11 | enable_hybrid_engine=False, 12 | inference_tp_size=1, 13 | release_inference_cache=False, 14 | pin_parameters=True, 15 | tp_gather_partition_size=8): 16 | 17 | device = "cpu" if offload else "none" 18 | zero_opt_dict = { 19 | "stage": stage, 20 | "offload_param": { 21 | "device": device 22 | }, 23 | "offload_optimizer": { 24 | "device": device 25 | }, 26 | "stage3_param_persistence_threshold": 1e4, 27 | "stage3_max_live_parameters": 3e7, 28 | "stage3_prefetch_bucket_size": 3e7, 29 | "memory_efficient_linear": False 30 | } 31 | return { 32 | "train_batch_size": GLOBAL_BATCH_SIZE, 33 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 34 | "steps_per_print": 10, 35 | "zero_optimization": zero_opt_dict, 36 | "fp16": { 37 | "enabled": True, 38 | "loss_scale_window": 100 39 | }, 40 | "gradient_clipping": 1.0, 41 | "prescale_gradients": False, 42 | "wall_clock_breakdown": False, 43 | "hybrid_engine": { 44 | "enabled": enable_hybrid_engine, 45 | "inference_tp_size": inference_tp_size, 46 | "release_inference_cache": release_inference_cache, 47 | "pin_parameters": pin_parameters, 48 | "tp_gather_partition_size": tp_gather_partition_size, 49 | } 50 | } 51 | 52 | 53 | def get_eval_ds_config(offload, stage=0): 54 | device = "cpu" if offload else "none" 55 | zero_opt_dict = { 56 | "stage": stage, 57 | "stage3_param_persistence_threshold": 1e4, 58 | "offload_param": { 59 | "device": device 60 | }, 61 | "memory_efficient_linear": False 62 | } 63 | return { 64 | "train_batch_size": GLOBAL_BATCH_SIZE, 65 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 66 | "steps_per_print": 10, 67 | "zero_optimization": zero_opt_dict, 68 | "fp16": { 69 | "enabled": True 70 | }, 71 | "gradient_clipping": 1.0, 72 | "prescale_gradients": False, 73 | "wall_clock_breakdown": False 74 | } 75 | -------------------------------------------------------------------------------- /train/docker/docker_run.sh: -------------------------------------------------------------------------------- 1 | # 需要先安装container-toolkit 2 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html 3 | 4 | export https_proxy=... 5 | export http_proxy=... 6 | export all_proxy=... 7 | 8 | belle_path=... 9 | docker_user=... 10 | tag=... 
11 | hf_home="/.../.cache/huggingface" 12 | ssh_pub_key="/home/.../.ssh/id_rsa.pub" 13 | workdir="$belle_path/train" 14 | chown root:root $ssh_pub_key 15 | 16 | # docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 17 | # --network host \ 18 | # --env HTTP_PROXY=$http_proxy \ 19 | # --env HF_HOME=$hf_home \ 20 | # -it --rm \ 21 | # -v $belle_path:$belle_path \ 22 | # -v $hf_home:$hf_home \ 23 | # -v $ssh_pub_key:/root/.ssh/authorized_keys \ 24 | # -w $workdir \ 25 | # $docker_user/transformers:ds_$tag \ 26 | # /bin/bash 27 | 28 | # 前台运行 29 | # docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 30 | # --network host \ 31 | # --env https_proxy=$https_proxy \ 32 | # --env http_proxy=$http_proxy \ 33 | # --env all_proxy=$all_proxy \ 34 | # --env HF_HOME=$hf_home \ 35 | # -it --rm \ 36 | # --name belle \ 37 | # -v $belle_path:$belle_path \ 38 | # -v $hf_home:$hf_home \ 39 | # -v $ssh_pub_key:/root/.ssh/authorized_keys \ 40 | # -w $workdir \ 41 | # $docker_user/belle:$tag \ 42 | # --sshd_port 2201 --cmd "echo 'export https_proxy=$https_proxy' >> /root/.bashrc && \ 43 | # echo 'export http_proxy=$http_proxy' >> /root/.bashrc && \ 44 | # echo 'export all_proxy=$all_proxy' >> /root/.bashrc && \ 45 | # echo 'export HF_HOME=$hf_home' >> /root/.bashrc && \ 46 | # /bin/bash" 47 | 48 | # 后台运行 49 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 50 | --network host \ 51 | --env https_proxy=$https_proxy \ 52 | --env http_proxy=$http_proxy \ 53 | --env all_proxy=$all_proxy \ 54 | --env HF_HOME=$hf_home \ 55 | -d --rm \ 56 | --name belle \ 57 | -v $belle_path:$belle_path \ 58 | -v $hf_home:$hf_home \ 59 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 60 | -w $workdir \ 61 | $docker_user/belle:$tag \ 62 | --sshd_port 2201 --cmd "echo 'export https_proxy=$https_proxy' >> /root/.bashrc && \ 63 | echo 'export http_proxy=$http_proxy' >> /root/.bashrc && \ 64 | echo 'export all_proxy=$all_proxy' >> /root/.bashrc && \ 65 | echo 'export HF_HOME=$hf_home' >> /root/.bashrc && \ 66 | sleep infinity" -------------------------------------------------------------------------------- /train/src/entry_point/inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoConfig 4 | from peft import PeftModel 5 | import argparse 6 | from tqdm import tqdm 7 | import json, os 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--model_name_or_path', type=str, required=True) 10 | parser.add_argument('--ckpt_path', type=str, required=True) 11 | parser.add_argument('--use_lora', action="store_true") 12 | parser.add_argument('--llama', action="store_true") 13 | args = parser.parse_args() 14 | 15 | 16 | max_new_tokens = 1024 17 | generation_config = dict( 18 | temperature=0.001, 19 | top_k=30, 20 | top_p=0.85, 21 | do_sample=True, 22 | num_beams=1, 23 | repetition_penalty=1.2, 24 | max_new_tokens=max_new_tokens 25 | ) 26 | 27 | instruction_list = [ 28 | "Human: \n小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?\n\nAssistant:\n", 29 | "Human: \n以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?\n\nAssistant:\n", 30 | "Human: \n题目:小明有5个球,他送给小红2个球,还剩多少个球?\n\nAssistant:\n", 31 | "Human: \n请问2+3等于几?\n\nAssistant:\n" 32 | ] 33 | 34 | 35 | if __name__ == '__main__': 36 | load_type = torch.float16 #Sometimes may need torch.float32 37 | if torch.cuda.is_available(): 38 | device = 
torch.device(0) 39 | else: 40 | device = torch.device('cpu') 41 | 42 | if args.llama: 43 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path) 44 | else: 45 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 46 | 47 | tokenizer.pad_token_id = 0 48 | tokenizer.bos_token_id = 1 49 | tokenizer.eos_token_id = 2 50 | tokenizer.padding_side = "left" 51 | model_config = AutoConfig.from_pretrained(args.model_name_or_path) 52 | 53 | if args.use_lora: 54 | base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=load_type) 55 | model = PeftModel.from_pretrained(base_model, args.ckpt_path, torch_dtype=load_type) 56 | else: 57 | model = AutoModelForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, config=model_config) 58 | 59 | if device==torch.device('cpu'): 60 | model.float() 61 | 62 | model.to(device) 63 | model.eval() 64 | print("Load model successfully") 65 | 66 | for instruction in instruction_list: 67 | inputs = tokenizer(instruction, max_length=max_new_tokens,truncation=True,return_tensors="pt") 68 | generation_output = model.generate( 69 | input_ids = inputs["input_ids"].to(device), 70 | **generation_config 71 | )[0] 72 | 73 | generate_text = tokenizer.decode(generation_output,skip_special_tokens=True) 74 | print(generate_text) 75 | print("-"*100) 76 | -------------------------------------------------------------------------------- /train/dschat_train_v1/utils/model/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import math 7 | import torch 8 | from transformers import ( 9 | AutoConfig, 10 | AutoModel, 11 | ) 12 | 13 | from transformers.deepspeed import HfDeepSpeedConfig 14 | 15 | # from .reward_model import RewardModel 16 | 17 | 18 | def create_hf_model(model_class, 19 | model_name_or_path, 20 | tokenizer, 21 | ds_config=None, 22 | rlhf_training=False): 23 | model_config = AutoConfig.from_pretrained(model_name_or_path) 24 | model_config.dropout = 0.0 25 | # Note: dschf is defined in function scope to avoid global effects 26 | # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration 27 | if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: 28 | dschf = HfDeepSpeedConfig(ds_config) 29 | else: 30 | dschf = None 31 | if rlhf_training: 32 | # the weight loading is handled by create critic model 33 | model = model_class.from_config(model_config) 34 | else: 35 | model = model_class.from_pretrained( 36 | model_name_or_path, 37 | from_tf=bool(".ckpt" in model_name_or_path), 38 | config=model_config) 39 | 40 | # model.config.end_token_id = tokenizer.eos_token_id 41 | # model.config.pad_token_id = model.config.eos_token_id 42 | # model.resize_token_embeddings(int(8 *math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8 43 | return model 44 | 45 | 46 | # def create_critic_model(model_name_or_path, 47 | # tokenizer, 48 | # ds_config, 49 | # num_padding_at_beginning=0, 50 | # rlhf_training=False): 51 | # # OPT model family always put a padding token at the beginning of the sequence, 52 | # # we did not see this in other models but not sure if it is a general rule 53 | # critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer, 54 | # ds_config, rlhf_training) 55 | # critic_model = RewardModel( 56 | # critic_model, 57 | # tokenizer, 58 | # 
num_padding_at_beginning=num_padding_at_beginning) 59 | 60 | # if rlhf_training: 61 | # # critic model needs to load the weight here 62 | # model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin') 63 | # assert os.path.exists( 64 | # model_ckpt_path 65 | # ), f"Cannot find model checkpoint at {model_ckpt_path}" 66 | # critic_model.load_state_dict( 67 | # torch.load(model_ckpt_path, map_location='cpu')) 68 | 69 | # return critic_model 70 | -------------------------------------------------------------------------------- /train/docker/transformers.dockerfile: -------------------------------------------------------------------------------- 1 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 2 | FROM nvcr.io/nvidia/pytorch:22.12-py3 3 | LABEL maintainer="Hugging Face" 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | ARG PYTORCH='2.0.1' 8 | # Example: `cu102`, `cu113`, etc. 9 | ARG CUDA='cu118' 10 | 11 | RUN apt -y update 12 | RUN apt install -y libaio-dev 13 | RUN python3 -m pip install --no-cache-dir --upgrade pip 14 | 15 | ARG REF=main 16 | RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF 17 | 18 | RUN python3 -m pip uninstall -y torch torchvision torchaudio 19 | 20 | # Install latest release PyTorch 21 | # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 22 | # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) 23 | RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA 24 | 25 | RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] 26 | 27 | RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate 28 | 29 | # Uninstall `transformer-engine` shipped with the base image 30 | RUN python3 -m pip uninstall -y transformer-engine 31 | 32 | # Uninstall `torch-tensorrt` shipped with the base image 33 | RUN python3 -m pip uninstall -y torch-tensorrt 34 | 35 | # recompile apex 36 | RUN python3 -m pip uninstall -y apex 37 | RUN git clone https://github.com/NVIDIA/apex 38 | # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners 39 | RUN cd apex && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 40 | 41 | # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) 42 | RUN python3 -m pip uninstall -y deepspeed 43 | # This has to be run (again) inside the GPU VMs running the tests. 44 | # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. 45 | # TODO: Find out why test fail. 46 | RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 47 | 48 | # When installing in editable mode, `transformers` is not recognized as a package. 49 | # this line must be added in order for python to be aware of transformers. 50 | RUN cd transformers && python3 setup.py develop 51 | 52 | # The base image ships with `pydantic==1.8.2` which is not working - i.e. 
the next command fails 53 | RUN python3 -m pip install -U --no-cache-dir "pydantic<2" 54 | RUN python3 -c "from deepspeed.launcher.runner import main" 55 | -------------------------------------------------------------------------------- /train/dschat_train_v1/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08 2 | FROM nvcr.io/nvidia/pytorch:22.08-py3 3 | LABEL maintainer="BELLE" 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | ARG PYTORCH='2.0.0' 8 | # Example: `cu102`, `cu113`, etc. 9 | ARG CUDA='cu117' 10 | 11 | RUN apt -y update 12 | RUN apt install -y libaio-dev 13 | RUN python3 -m pip install --no-cache-dir --upgrade pip 14 | RUN python3 -m pip install -U pip 15 | RUN python3 -m pip config set global.index-url http://mirrors.aliyun.com/pypi/simple 16 | RUN python3 -m pip config set install.trusted-host mirrors.aliyun.com 17 | 18 | ARG REF=main 19 | RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF 20 | 21 | # Install latest release PyTorch 22 | # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 23 | # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) 24 | RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA 25 | 26 | RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] 27 | 28 | # Uninstall `torch-tensorrt` shipped with the base image 29 | RUN python3 -m pip uninstall -y torch-tensorrt 30 | 31 | # recompile apex 32 | RUN python3 -m pip uninstall -y apex 33 | RUN git clone https://github.com/NVIDIA/apex 34 | # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners 35 | RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 36 | 37 | # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) 38 | RUN python3 -m pip uninstall -y deepspeed 39 | # This has to be run (again) inside the GPU VMs running the tests. 40 | # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. 41 | # TODO: Find out why test fail. 42 | RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 43 | 44 | # When installing in editable mode, `transformers` is not recognized as a package. 45 | # this line must be added in order for python to be aware of transformers. 46 | RUN cd transformers && python3 setup.py develop 47 | 48 | # The base image ships with `pydantic==1.8.2` which is not working - i.e. 
the next command fails 49 | RUN python3 -m pip install -U --no-cache-dir pydantic 50 | RUN python3 -c "from deepspeed.launcher.runner import main" 51 | 52 | #customer 53 | RUN python3 -m pip install datasets>=2.8.0 54 | RUN python3 -m pip install sentencepiece>=0.1.97 55 | RUN python3 -m pip install protobuf==3.20.3 56 | RUN python3 -m pip install accelerate>=0.15.0 57 | -------------------------------------------------------------------------------- /train/docs/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | 这里给出一些实验过程中遇到的问题以及供参考的解决方案,同时对问题大致地分类 4 | 5 | **解决方案仅供参考,未必能彻底解决对应问题!!!** 6 | 7 | **解决方案仅供参考,未必能彻底解决对应问题!!!** 8 | 9 | **解决方案仅供参考,未必能彻底解决对应问题!!!** 10 | 11 | ### Deepspeed相关 12 | 13 | | 报错信息 | 参考 | 14 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 15 | | RuntimeError: CUDA error: an illegal memory access was encountered.CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect | https://github.com/pytorch/pytorch/issues/21819 | 16 | | RuntimeError: Error building extension 'fused_adam' | sudo ln -s /usr/local/cuda/lib64/libcudart.so /usr/lib/libcudart.so | 17 | | RuntimeError: expected scalar type Float but found Half | use_int8_training和deepspeed不能同时指定 | 18 | | RuntimeError: expected scalar type Float but found Half | V100显卡上 use_int8_training和fp16不能同时指定 | 19 | 20 | ### transformers相关 21 | 22 | | 报错信息 | 参考 | 23 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 24 | | AutoTokenizer.from_pretrained("llama_model_path")出现递归error
RecursionError: maximum recursion depth exceeded | 有可能是transformers版本的问题,对于LlamaModel,可采用LlamaTokenizer加载 | 25 | | torch.distributed.distributed_c10d.init_process_group() got multiple values for keyword argument 'backend' | transformers降低版本至4.28.1 | 26 | | | | 27 | 28 | ### 其他问题 29 | 30 | | 报错信息 | 参考 | 31 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 32 | | V100机器上8bit量化训练失败或loss不正常 | https://github.com/Facico/Chinese-Vicuna/issues/39
https://github.com/TimDettmers/bitsandbytes/issues/100
https://github.com/mymusise/ChatGLM-Tuning/issues/19
https://github.com/tloen/alpaca-lora/issues/170 | 33 | | | | 34 | | huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': . Use `repo_type` argument if needed. | 这是因为docker容器内访问不到model_name_or_path,需要挂载到物理机对应的目录。 | 35 | 36 | 37 | 38 | 这里给出一些实验建议: 39 | 40 | 1. 不开deepspeed会占用更多显存,建议全量参数finetune模式尽可能采用deepspeed 41 | 2. LoRA训练如果采用8bit量化,就不能使用deepspeed;如果使用deepspeed,就不能指定use_int8_training 42 | 43 | 关于deepspeed的配置可参考: 44 | 45 | 1. https://github.com/microsoft/DeepSpeed/issues/2187 46 | 2. https://www.deepspeed.ai/tutorials/advanced-install/ 47 | 3. https://github.com/pyg-team/pytorch_geometric/issues/1001 48 | 49 | -------------------------------------------------------------------------------- /train/debug/templates/output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Token Probabilities 5 | 6 | 71 | 72 | 73 | {% for item in tokens_with_probs %} 74 | 75 | {{ item.token }} 76 | {{ '%.2f'|format(item.prob*100) }}% 77 | 78 | {% endfor %} 79 | 80 |
(below the token/probability table: a 0.0-1.0 scale legend rendered with {% for i in range(11) %} {{ i / 10 }} {% endfor %} and the line "Loss: {{ loss }}"; the surrounding HTML markup was not preserved in this listing)
91 | 92 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /train/src/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Any, List, Union 3 | from gradio_client import Client 4 | from tqdm import tqdm 5 | from transformers.deepspeed import is_deepspeed_zero3_enabled 6 | from deepspeed.runtime.engine import DeepSpeedEngine 7 | import torch 8 | 9 | 10 | def get_ds_state_dict(ds_engine: DeepSpeedEngine): 11 | """ 12 | 如果是zero stage 3,要对所有rank调用,无视掉stage3_gather_16bit_weights_on_model_save参数 13 | """ 14 | if ds_engine.zero_optimization_partition_weights(): 15 | # consolidation is expensive in time and memory and therefore isn't a default 16 | state_dict = ds_engine._zero3_consolidated_16bit_state_dict() 17 | else: 18 | state_dict = ds_engine.module.state_dict() 19 | return state_dict 20 | 21 | 22 | def get_model_param_count(model: Union[DeepSpeedEngine, torch.nn.Module], trainable_only=False): 23 | """ 24 | Calculate model's total param count. If trainable_only is True then count only those requiring grads 25 | """ 26 | if is_deepspeed_zero3_enabled() and isinstance(model, DeepSpeedEngine): 27 | def numel(p): 28 | return p.ds_numel 29 | 30 | else: 31 | def numel(p): 32 | return p.numel() 33 | 34 | return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad) 35 | 36 | 37 | class MultiClient(object): 38 | def __init__(self, worker_addrs) -> None: 39 | self.clients = [Client(addr) for addr in worker_addrs] 40 | 41 | def predict(self, tasks: List[List], max_retries: int = 3) -> List[Any]: 42 | pbar = tqdm(total=len(tasks)) 43 | jobs = { 44 | client: (i, client.submit(*(tasks[i]), api_name="/predict")) 45 | for i, client in enumerate(self.clients) 46 | if i < len(tasks) 47 | } 48 | results = {} 49 | retries = {i: 0 for i in range(len(tasks))} 50 | 51 | while jobs: 52 | for client, (i, job) in list(jobs.items()): 53 | if job.done(): 54 | pbar.update(1) 55 | del jobs[client] 56 | try: 57 | result = job.result() 58 | results[i] = result 59 | except Exception as e: 60 | print("Job failed with error:", e) 61 | if retries[i] < max_retries: 62 | print("Retrying job...") 63 | retries[i] += 1 64 | new_job = client.submit( 65 | *tasks[i], api_name="/predict") 66 | jobs[client] = (i, new_job) 67 | continue # Skip the rest of the loop 68 | else: 69 | results[i] = None 70 | 71 | if tasks: 72 | new_i = len(results) + len(jobs) 73 | if new_i < len(tasks): 74 | new_task = tasks[new_i] 75 | new_job = client.submit( 76 | *new_task, api_name="/predict") 77 | jobs[client] = (new_i, new_job) 78 | time.sleep(0.1) 79 | pbar.close() 80 | 81 | predicts = [results[i] for i in sorted(results)] 82 | 83 | return predicts 84 | -------------------------------------------------------------------------------- /train/dschat_train_v1/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | **我们的机器配置是8卡A100 40G,所以下面的问题也只针对8卡A100 40G的配置** 4 | 5 | LLaMA模型的max_seq_len通常设置为1024或2048 6 | 7 | Bloomz模型的max_seq_len通常设置为512或1024 8 | 9 | ## 1. 单机单卡可以训练多大参数量的模型 10 | 11 | 现在的deepspeed-chat项目还不支持offload,目前实验来看,在max_seq_len设置为1024的情况下,单机单卡(40G)上微调的模型参数量应该最多也就是bloomz-1b1。(总结的并不严谨,欢迎大家指正) 12 | 13 | 如果想要微调bloomz-1b7模型,可能需要将max_seq_len设置的很小。 14 | 15 | 16 | 17 | ## 2. 单机多卡可以训练多大参数量的模型 18 | 19 | 目前来看,可以跑起来7b1的bloomz和7B的LLaMA。 20 | 21 | 22 | 23 | ## 3. 
单机单卡采用LoRA可以训练多大参数量的模型 24 | 25 | 可以训练7B的LLaMA。而对于7b1的bloom,需要将max_seq_len设置小一些。 26 | 27 | 28 | 29 | ## 4. 单机多卡采用LoRA可以训练多大参数量的模型 30 | 31 | 可以训练13B的模型。目前还没在8卡A100 40G上尝试过更大参数量的模型。欢迎大家一起交流。 32 | 33 | ## 5. 加载Llama tokenizer时存在的问题 34 | 35 | Llama的tokenizer初始没有pad_token_id,需要赋值 36 | 37 | 实验过程中发现,不同的transformers版本在加载Llama词表时会出现一些问题,记录如下: 38 | 39 | | transformers版本 | 问题 | 40 | | ---------------- | ------------------------------------------------------------ | 41 | | 4.28.0.dev0 | 当前版本可正常加载tokenizer,unk_token_id=0, bos_token_id=1, eos_token_id=2 | 42 | | 4.28.1 | AutoTokenizer.from_pretrained会出现RecursionError: maximum recursion depth exceeded,需要用LlamaTokenizer
eos_token_id,bos_token_id,unk_token_id都是0 | 43 | | 4.29.0.dev0 | 该版本与4.28.1存在同样的问题 | 44 | 45 | 目前统一解决办法是,如果模型是llama,则做如下赋值操作: 46 | 47 | ```python 48 | tokenizer.pad_token_id = 0 49 | tokenizer.bos_token_id = 1 50 | tokenizer.eos_token_id = 2 51 | ``` 52 | 53 | ## 6. 加载2M的数据量需要多大的内存和多长时间 54 | 55 | 对于200万的数据量,通过观察,大概要350G的内存,加载时长大概在25min左右(这是Bloom的时长,如果是Llama,tokenize的时间会加长) 56 | 57 | 我们目前尚未对加载数据部分的代码做优化,包括内存和时长。 58 | 59 | ## 7. 训练模型的生成结果非常糟糕 60 | 61 | 这里的糟糕指的是生成的结果形如:“我们用减法计算出小明还剩下多少个鸡蛋多少个鸡蛋多少个鸡蛋多少个鸡蛋个鸡蛋个鸡蛋减法计算减法计算蛋个鸡蛋” 62 | 63 | 就目前我们的实验经验来看,出现这个问题的主要原因出在这几个特殊的token_id上,尤其是pad_token_id和eos_token_id的值,要确保两者不相等,而且pad_token_id=0, eos_token_id=2。(不区分LLaMA和Bloom) 64 | 65 | ## Others 66 | 67 | 这里我们提供了在实验过程中遇到的一些报错的情况,并提供了参考的解决方案。(**注:参考方案未必一定能够解决对应的问题**) 68 | 69 | | 报错信息 | 参考 | 70 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 71 | | RuntimeError: CUDA error: an illegal memory access was encountered.CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect | https://github.com/pytorch/pytorch/issues/21819 | 72 | | AutoTokenizer.from_pretrained("llama_model_path")出现递归error
RecursionError: maximum recursion depth exceeded | 有可能是transformers版本的问题,对于LlamaModel,可采用LlamaTokenizer加载 | 73 | | xx>=0.11.0 is required for a normal functioning of this module, but found xx==0.10.0 | 这是因为版本不匹配导致的问题,可按照报错信息安装要求的版本即可 | 74 | | torch.distributed.distributed_c10d.init_process_group() got multiple values for keyword argument 'backend' | transformers降低版本至4.28.1 | 75 | | RuntimeError: Error building extension 'fused_adam' | sudo ln -s /usr/local/cuda/lib64/libcudart.so /usr/lib/libcudart.so | 76 | | huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': . Use `repo_type` argument if needed. | 这是因为docker容器内访问不到model_name_or_path,需要挂载到物理机对应的目录。 | -------------------------------------------------------------------------------- /train/scripts/run_pt.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | export WANDB_PROJECT=... 4 | export WANDB_RUN_ID=... 5 | export WANDB_RESUME=allow 6 | export ABS_PATH=... 7 | export PYTHONPATH="$ABS_PATH/BELLE/train" 8 | model_name_or_path=/path_to_llm/hf_llama_7b/ # or bloomz-7b1-mt 9 | 10 | train_file=belleMath.json 11 | validation_file=belleMath-dev1K.json 12 | output_dir="$ABS_PATH/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID}" 13 | mkdir -p ${output_dir} 14 | 15 | cache_dir=hf_cache_dir 16 | mkdir -p ${cache_dir} 17 | cutoff_len=1024 18 | 19 | #FT 20 | # torchrun --nproc_per_node 8 src/entry_point/pt_train.py \ 21 | # --ddp_timeout 36000 \ 22 | # --model_name_or_path ${model_name_or_path} \ 23 | # --llama \ 24 | # --deepspeed configs/deepspeed_config.json \ 25 | # --train_file ${train_file} \ 26 | # --validation_file ${validation_file} \ 27 | # --per_device_train_batch_size 2 \ 28 | # --per_device_eval_batch_size 2 \ 29 | # --gradient_accumulation_steps 4 \ 30 | # --num_train_epochs 2 \ 31 | # --model_max_length ${cutoff_len} \ 32 | # --save_strategy "steps" \ 33 | # --save_total_limit 3 \ 34 | # --learning_rate 8e-6 \ 35 | # --weight_decay 0.00001 \ 36 | # --warmup_ratio 0.05 \ 37 | # --lr_scheduler_type "cosine" \ 38 | # --logging_steps 10 \ 39 | # --evaluation_strategy "steps" \ 40 | # --bf16 \ 41 | # --seed 1234 \ 42 | # --gradient_checkpointing \ 43 | # --cache_dir ${cache_dir} \ 44 | # --output_dir ${output_dir} \ 45 | # # --resume_from_checkpoint ... 46 | 47 | 48 | #LoRA with 8bit 49 | # torchrun --nproc_per_node 8 src/entry_point/pt_train.py \ 50 | # --ddp_timeout 36000 \ 51 | # --model_name_or_path ${model_name_or_path} \ 52 | # --llama \ 53 | # --use_lora \ 54 | # --use_int8_training \ 55 | # --lora_config configs/lora_config_llama.json \ 56 | # --train_file ${train_file} \ 57 | # --validation_file ${validation_file} \ 58 | # --per_device_train_batch_size 1 \ 59 | # --per_device_eval_batch_size 1 \ 60 | # --gradient_accumulation_steps 8 \ 61 | # --num_train_epochs 2 \ 62 | # --model_max_length ${cutoff_len} \ 63 | # --save_strategy "steps" \ 64 | # --save_total_limit 3 \ 65 | # --learning_rate 8e-6 \ 66 | # --weight_decay 0.00001 \ 67 | # --warmup_ratio 0.05 \ 68 | # --lr_scheduler_type "cosine" \ 69 | # --logging_steps 10 \ 70 | # --evaluation_strategy "steps" \ 71 | # --bf16 \ 72 | # --seed 1234 \ 73 | # --gradient_checkpointing \ 74 | # --cache_dir ${cache_dir} \ 75 | # --output_dir ${output_dir} \ 76 | # # --resume_from_checkpoint ... 
77 | 78 | # LoRA without 8bit 79 | torchrun --nproc_per_node 8 src/entry_point/pt_train.py \ 80 | --ddp_timeout 36000 \ 81 | --model_name_or_path ${model_name_or_path} \ 82 | --llama \ 83 | --use_lora \ 84 | --deepspeed configs/deepspeed_config_stage3.json \ 85 | --lora_config configs/lora_config_llama.json \ 86 | --train_file ${train_file} \ 87 | --validation_file ${validation_file} \ 88 | --per_device_train_batch_size 1 \ 89 | --per_device_eval_batch_size 1 \ 90 | --gradient_accumulation_steps 1 \ 91 | --num_train_epochs 10 \ 92 | --model_max_length ${cutoff_len} \ 93 | --save_strategy "steps" \ 94 | --save_total_limit 3 \ 95 | --learning_rate 3e-4 \ 96 | --weight_decay 0.00001 \ 97 | --warmup_ratio 0.01 \ 98 | --lr_scheduler_type "cosine" \ 99 | --logging_steps 10 \ 100 | --evaluation_strategy "steps" \ 101 | --bf16 \ 102 | --seed 1234 \ 103 | --gradient_checkpointing \ 104 | --cache_dir ${cache_dir} \ 105 | --output_dir ${output_dir} \ 106 | # --resume_from_checkpoint ... 107 | -------------------------------------------------------------------------------- /train/scripts/run_multi_backend.py: -------------------------------------------------------------------------------- 1 | import fcntl 2 | import subprocess 3 | import os 4 | import signal 5 | import sys 6 | import time 7 | from transformers import HfArgumentParser 8 | from dataclasses import dataclass, field 9 | 10 | cuda_devices = os.getenv("CUDA_VISIBLE_DEVICES", "") 11 | if cuda_devices == "": 12 | num_process = 1 13 | cudas = [''] 14 | else: 15 | cudas = [cuda.strip() for cuda in cuda_devices.split(",")] 16 | num_process = len(cudas) 17 | 18 | @dataclass 19 | class Arguments: 20 | command: str = field(metadata={'help': 'command'}) 21 | 22 | args = HfArgumentParser((Arguments)).parse_args_into_dataclasses()[0] 23 | 24 | processes = [] 25 | outputs = {} # 用于存储子进程的输出 26 | 27 | # ANSI 转义序列颜色代码 28 | COLORS = [ 29 | "\033[31m", # 红色 30 | "\033[32m", # 绿色 31 | "\033[33m", # 黄色 32 | "\033[34m", # 蓝色 33 | "\033[35m", # 紫色 34 | "\033[36m", # 青色 35 | "\033[91m", # 浅红色 36 | "\033[92m", # 浅绿色 37 | "\033[93m", # 浅黄色 38 | "\033[94m", # 浅蓝色 39 | "\033[95m", # 浅紫色 40 | "\033[96m", # 浅青色 41 | ] 42 | 43 | def set_non_blocking(file): 44 | fd = file.fileno() 45 | flags = fcntl.fcntl(fd, fcntl.F_GETFL) 46 | fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK) 47 | 48 | def terminate_processes_and_exit(exit_code=0): 49 | print("终止子进程...") 50 | for process in processes: 51 | process.terminate() 52 | 53 | # 等待子进程终止 54 | for process in processes: 55 | process.wait() 56 | 57 | # 打印子进程的输出 58 | for local_rank, output in outputs.items(): 59 | color_code = COLORS[local_rank % len(COLORS)] # 根据进程编号选择颜色 60 | print(f"{color_code}rank: {local_rank} stdout: {output['stdout'].decode()}\033[0m") 61 | print(f"{color_code}rank: {local_rank} stderr: {output['stderr'].decode()}\033[0m") 62 | 63 | # 退出主进程 64 | sys.exit(exit_code) 65 | 66 | def handle_termination(signal, frame): 67 | print("收到终止信号,终止子进程...") 68 | terminate_processes_and_exit() 69 | 70 | signal.signal(signal.SIGINT, handle_termination) 71 | signal.signal(signal.SIGTERM, handle_termination) 72 | 73 | # 获取父进程的环境变量 74 | parent_env = os.environ.copy() 75 | print(f'cudas: {cudas}') 76 | 77 | for local_rank in range(num_process): 78 | # 设置新的环境变量 79 | env = parent_env.copy() 80 | env['CUDA_VISIBLE_DEVICES'] = f'{cudas[local_rank]}' 81 | command = f'{args.command} --local_rank {local_rank}' 82 | print(command) 83 | process = subprocess.Popen( 84 | command, 85 | shell=True, 86 | env=env, 87 | 
stdout=subprocess.PIPE, 88 | stderr=subprocess.PIPE, 89 | ) 90 | processes.append(process) 91 | outputs[local_rank] = { 92 | "stdout": b"", 93 | "stderr": b"", 94 | } 95 | 96 | for process in processes: 97 | set_non_blocking(process.stdout) 98 | set_non_blocking(process.stderr) 99 | 100 | try: 101 | while True: 102 | for local_rank, process in enumerate(processes): 103 | stdout = process.stdout.read() 104 | stderr = process.stderr.read() 105 | 106 | if stdout: 107 | new_output = stdout[len(outputs[local_rank]["stdout"]) :] 108 | outputs[local_rank]["stdout"] += new_output 109 | color_code = COLORS[local_rank % len(COLORS)] # 根据进程编号选择颜色 110 | print(f"{color_code}rank: {local_rank} stdout: {new_output.decode()}\033[0m") 111 | 112 | if stderr: 113 | new_output = stderr[len(outputs[local_rank]["stderr"]) :] 114 | outputs[local_rank]["stderr"] += new_output 115 | color_code = COLORS[local_rank % len(COLORS)] # 根据进程编号选择颜色 116 | print(f"{color_code}rank: {local_rank} stderr: {new_output.decode()}\033[0m") 117 | 118 | processes = [process for process in processes if process.poll() is None] 119 | 120 | if not processes: 121 | break 122 | 123 | time.sleep(0.1) 124 | except Exception as e: 125 | print("主脚本发生异常:", str(e)) 126 | terminate_processes_and_exit(1) 127 | 128 | -------------------------------------------------------------------------------- /train/scripts/debug.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | export CUDA_VISIBLE_DEVICES='0,1' 4 | export WANDB_PROJECT=debug 5 | export WANDB_RUN_ID=debug 6 | export WANDB_RESUME=allow 7 | export PYTHONPATH='/data/hanweiguang/Projects/BELLE/train' 8 | 9 | # model_name_or_path="decapoda-research/llama-7b-hf" 10 | model_name_or_path="bigscience/bloomz-560m" 11 | 12 | train_file=/data/hanweiguang/Projects/BELLE/data/test_data/test_pt.jsonl 13 | validation_file=/data/hanweiguang/Projects/BELLE/data/test_data/test_pt.jsonl 14 | output_dir=/data/hanweiguang/Projects/BELLE/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID} 15 | rm -rf $output_dir 16 | mkdir -p ${output_dir} 17 | 18 | cache_dir=hf_cache_dir 19 | mkdir -p ${cache_dir} 20 | cutoff_len=32 21 | 22 | 23 | # FT 24 | torchrun --nproc_per_node 2 src/entry_point/pt_train.py \ 25 | --model_name_or_path ${model_name_or_path} \ 26 | --deepspeed configs/deepspeed_config.json \ 27 | --train_file ${train_file} \ 28 | --validation_file ${validation_file} \ 29 | --per_device_train_batch_size 1 \ 30 | --per_device_eval_batch_size 1 \ 31 | --gradient_accumulation_steps 1 \ 32 | --num_train_epochs 1 \ 33 | --model_max_length ${cutoff_len} \ 34 | --save_strategy "steps" \ 35 | --save_total_limit 3 \ 36 | --learning_rate 1e-7 \ 37 | --weight_decay 0.00001 \ 38 | --warmup_ratio 0.05 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 10 \ 41 | --evaluation_strategy "steps" \ 42 | --seed 1234 \ 43 | --gradient_checkpointing \ 44 | --cache_dir ${cache_dir} \ 45 | --output_dir ${output_dir} \ 46 | --report_to "tensorboard" \ 47 | --bf16 \ 48 | # --fp16 \ 49 | # --llama 50 | 51 | # debug lora single node multiple gpus 52 | # torchrun --nproc_per_node 2 --rdzv-endpoint "127.0.0.1:30012" \ 53 | # "src/entry_point/pt_train.py" \ 54 | # --model_name_or_path ${model_name_or_path} \ 55 | # --train_file ${train_file} \ 56 | # --validation_file ${validation_file} \ 57 | # --per_device_train_batch_size 1 \ 58 | # --per_device_eval_batch_size 1 \ 59 | # --gradient_accumulation_steps 1 \ 60 | # --num_train_epochs 1 
\ 61 | # --model_max_length ${cutoff_len} \ 62 | # --learning_rate 3e-4 \ 63 | # --weight_decay 0.00001 \ 64 | # --warmup_ratio 0.01 \ 65 | # --lr_scheduler_type "cosine" \ 66 | # --logging_steps 10 \ 67 | # --save_strategy "steps" \ 68 | # --save_total_limit 1 \ 69 | # --evaluation_strategy "steps" \ 70 | # --seed 1234 \ 71 | # --gradient_checkpointing \ 72 | # --cache_dir ${cache_dir} \ 73 | # --output_dir ${output_dir} \ 74 | # --report_to tensorboard \ 75 | # --use_lora \ 76 | # --lora_config configs/lora_config_bloom.json \ 77 | # --deepspeed configs/deepspeed_config_stage3.json \ 78 | # --bf16 \ 79 | # --fp16 \ 80 | # --use_int8_training \ 81 | # # --resume_from_checkpoint "/data/hanweiguang/Projects/BELLE/saved_models/boxue_debug_debug/checkpoint-5" \ 82 | # # --llama \ 83 | 84 | # master_addr='10.201.102.66' 85 | # master_port='65530' 86 | 87 | # # debug lora multiple nodes 88 | # torchrun \ 89 | # --nproc_per_node 1 \ 90 | # --nnode 2 \ 91 | # --node_rank $1 \ 92 | # --master_addr $master_addr \ 93 | # --master_port $master_port \ 94 | # 'src/entry_point/pt_train.py' \ 95 | # --model_name_or_path ${model_name_or_path} \ 96 | # --train_file ${train_file} \ 97 | # --validation_file ${validation_file} \ 98 | # --per_device_train_batch_size 1 \ 99 | # --per_device_eval_batch_size 1 \ 100 | # --gradient_accumulation_steps 1 \ 101 | # --num_train_epochs 40 \ 102 | # --model_max_length ${cutoff_len} \ 103 | # --learning_rate 3e-4 \ 104 | # --weight_decay 0.00001 \ 105 | # --warmup_ratio 0.01 \ 106 | # --lr_scheduler_type "cosine" \ 107 | # --logging_steps 10 \ 108 | # --save_strategy "steps" \ 109 | # --save_total_limit 3 \ 110 | # --evaluation_strategy "steps" \ 111 | # --fp16 \ 112 | # --seed 1234 \ 113 | # --gradient_checkpointing \ 114 | # --cache_dir ${cache_dir} \ 115 | # --output_dir ${output_dir} \ 116 | # --deepspeed configs/deepspeed_config_stage3.json \ 117 | # --lora_config configs/lora_config_bloom.json \ 118 | # --use_lora \ 119 | # # --llama \ -------------------------------------------------------------------------------- /train/src/sample_generator.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from typing import Any, Dict, List 3 | import pudb 4 | import copy 5 | from transformers import PreTrainedTokenizer 6 | import json 7 | 8 | IGNORE_INDEX = -100 9 | 10 | 11 | def generate_and_tokenize_prompt( 12 | model_max_length: int, 13 | tokenizer: PreTrainedTokenizer, 14 | data_point: Dict[str, Any], 15 | fix_length=False, 16 | padding_side="left", 17 | ): 18 | input_ids = [] 19 | labels = [] 20 | source = data_point["conversations"] 21 | for sentence in source: 22 | sentence_from = sentence["from"].lower() 23 | sentence_value = ( 24 | "Human: \n" + sentence["value"] + "\n\nAssistant: \n" 25 | if sentence_from == "human" 26 | else sentence["value"] 27 | ) # https://github.com/LianjiaTech/BELLE/issues/337 28 | # conversation += sentence_value 29 | sentence_ids = tokenizer.encode( 30 | sentence_value, add_special_tokens=False 31 | ) # do not add bos_token_id 32 | label = ( 33 | copy.deepcopy(sentence_ids) 34 | if sentence_from != "human" 35 | else [IGNORE_INDEX] * len(sentence_ids) 36 | ) 37 | input_ids += sentence_ids 38 | labels += label 39 | # add eos at every end of assistant sentence 40 | if sentence_from != "human": 41 | input_ids += [tokenizer.eos_token_id] # make sure eos_token_id is correct 42 | labels += [tokenizer.eos_token_id] 43 | 44 | input_ids = input_ids[:model_max_length] 45 | labels = 
labels[:model_max_length] 46 | 47 | if all(x == IGNORE_INDEX for x in labels): 48 | labels[18:24] = input_ids[ 49 | 18:24 50 | ] # labels can not have all values being -100. 18 and 24 are just random numbers 51 | attention_mask = [1] * len(input_ids) 52 | 53 | if fix_length: 54 | if padding_side == "left": 55 | input_ids = [tokenizer.pad_token_id] * ( 56 | model_max_length - len(input_ids) 57 | ) + input_ids 58 | labels = [tokenizer.pad_token_id] * ( 59 | model_max_length - len(labels) 60 | ) + labels 61 | attention_mask = [0] * ( 62 | model_max_length - len(attention_mask) 63 | ) + attention_mask 64 | else: 65 | input_ids = input_ids + [tokenizer.pad_token_id] * ( 66 | model_max_length - len(input_ids) 67 | ) 68 | labels = labels + [tokenizer.pad_token_id] * ( 69 | model_max_length - len(labels) 70 | ) 71 | attention_mask = attention_mask + [0] * ( 72 | model_max_length - len(attention_mask) 73 | ) 74 | 75 | tokenized_full_prompt = { 76 | "input_ids": input_ids, 77 | "attention_mask": attention_mask, 78 | "labels": labels, 79 | } 80 | return tokenized_full_prompt 81 | 82 | 83 | def batch_grouped_pretrain_generate( 84 | model_max_length: int, 85 | tokenizer: PreTrainedTokenizer, 86 | examples: Dict[str, List[str]], 87 | ) -> Dict[str, List[List[int]]]: 88 | # build grouped texts with format `X1 X2 X3 ... X1 X2 X3 ... []` 89 | token_ids_list: List[List[int]] = tokenizer( 90 | examples["text"], add_special_tokens=False 91 | )["input_ids"] 92 | token_ids_list = [ 93 | token_ids + [tokenizer.eos_token_id] for token_ids in token_ids_list 94 | ] 95 | concatenated_ids = list(chain(*token_ids_list)) 96 | # we drop the small remainder, and if the total_length < block_size, we exclude this batch 97 | total_length = (len(concatenated_ids) // model_max_length) * model_max_length 98 | result = [ 99 | concatenated_ids[i : i + model_max_length] 100 | for i in range(0, total_length, model_max_length) 101 | ] 102 | return {"input_ids": result, "labels": result.copy()} 103 | 104 | 105 | def exam_generate(model_max_length: int, tokenizer: PreTrainedTokenizer, data_point): 106 | template = "Human: \n{human}\n\nAssistant: \n" 107 | # pudb.set_trace() 108 | input_str = template.format( 109 | human=f'回答下面的{data_point["type"]}题,用json返回答案,包括原因和答案,如{{"reason":..., "answer":...}}\n{data_point["question"]}\n选项:{" ".join(data_point["candidates"])}' 110 | ) 111 | input_ids = tokenizer.encode(input_str, add_special_tokens=False) 112 | labels = [IGNORE_INDEX] * len(input_ids) 113 | bot_ids = tokenizer.encode( 114 | json.dumps( 115 | {"reason": data_point["reason"], "answer": data_point["answer"]}, 116 | ensure_ascii=False, 117 | ), 118 | add_special_tokens=False, 119 | ) 120 | input_ids += bot_ids 121 | labels += bot_ids 122 | 123 | input_ids += [tokenizer.eos_token_id] 124 | labels += [tokenizer.eos_token_id] 125 | 126 | input_ids = input_ids[: model_max_length - 1] 127 | labels = labels[: model_max_length - 1] 128 | return { 129 | "input_ids": input_ids, 130 | "attention_mask": [1] * len(input_ids), 131 | "labels": labels, 132 | } 133 | -------------------------------------------------------------------------------- /train/reproduce_our_papers/Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md: -------------------------------------------------------------------------------- 1 | ### [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and 
Evaluation](https://github.com/LianjiaTech/BELLE/blob/main/docs/Towards%20Better%20Instruction%20Following%20Language%20Models%20for%20Chinese.pdf) 2 | 3 | # 论文简介 4 | 5 | 这篇论文研究了基于当前公开的指令数据训练得到的模型,能达到怎样的效果。我们在自有的1000条测试集上进行了量化评估,同时,为了提高模型在中文领域的性能和训练/推理效率,我们进一步扩展了LLaMA的词汇表,并在34亿个中文词汇上进行了二次预训练。 6 | 7 | 我们使用到的公开指令训练数据有: 8 | 9 | 1. GPT-3.5生成的Stanford alpaca 中文以及英文数据 10 | 2. GPT-4生成的Alpaca 中文以及英文数据 11 | 3. 用户分享的与ChatGPT的真实对话数据sharegpt 12 | 13 | 我们着眼于探究训练数据类别对模型性能的影响。具体而言,我们考察了训练数据的数量、质量和语言分布等因素。 14 | 15 | 实验结果如下: 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
| Factor | Base model | Training data | Score_w/o_others |
| ------------ | ------------ | ---------------------------------------------- | ---------------- |
| 词表扩充 | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B | zh(alpaca-3.5&4) + sharegpt | 0.652 |
| 数据质量 | LLaMA-7B-EXT | zh(alpaca-3.5) | 0.642 |
| | LLaMA-7B-EXT | zh(alpaca-4) | 0.693 |
| 数据语言分布 | LLaMA-7B-EXT | cn(alpaca-3.5&4) | 0.679 |
| | LLaMA-7B-EXT | en(alpaca-3.5&4) | 0.659 |
| | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B-EXT | en(alpaca-3.5&4) + sharegpt | 0.668 |
| 数据规模 | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt + BELLE-0.5M-CLEAN | 0.762 |
| - | ChatGPT | - | 0.824 |
83 | 84 | 其中**BELLE-0.5M-CLEAN**是从我们内部的230万指令数据中清洗得到0.5M数据,其中包含单轮和多轮对话数据,和之前开放的0.5M数据不是同一批数据。这份数据还未开源,但是我们已经将实验中效果最好的模型 (score 0.762) 开源在[Hugging Face](https://huggingface.co/BelleGroup/BELLE-on-Open-Datasets). 85 | 86 | # 准备数据集 87 | 88 | ### 下载数据集 89 | 90 | 我们的论文中使用了五个开源数据集: 91 | 92 | | Data | URL | 93 | | -------------- | -------------------------------------------------------------------------------------------------- | 94 | | alpaca-3.5-en | https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json | 95 | | alpaca-3.5-zh | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/dat | 96 | | alpaca-4-en | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data.json | 97 | | alpaca-4-zh | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data_zh.json | 98 | | sharegpt$^1$ | https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main | 99 | 100 | $^1$:请注意,sharegpt 数据集可能会持续更新,使得与我们论文中使用的略有不同,但不会对实验结论有大的影响。 101 | 102 | ### 预处理 103 | 104 | ##### 清洗 sharegpt 数据集 105 | 106 | 我们采用了[Vicuna](https://github.com/lm-sys/FastChat/blob/main/docs/commands/data_cleaning.md)中的数据清洗方法。 107 | 108 | 1. 通过 `fastchat.data.clean_sharegpt` 将 html 转换为 markdown 109 | 2. 通过 `fastchat.data.optional_clean` 删除除英文和中文之外的其他语言 110 | 3. 通过 `fastchat.data.split_long_conversation` 将最大长度为 2048 个令牌的对话分割开 111 | 112 | #### 统一数据格式 113 | 114 | 我们将所有数据集统一为以下形式: 115 | 116 | ```python 117 | { 118 | "id": "uniq_sample_id", 119 | "conversations": [ 120 | {"from": "human", "value": "你好"}, 121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"}, 122 | {"from": "human", "value": "今天天气怎么样?"}, 123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"} 124 | ] 125 | } 126 | 127 | ``` 128 | 129 | # 下载 LLaMA-EXT-7B 130 | 131 | LLaMA-EXT-7B 是基于 [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) 的模型,进一步在 34 亿中文词汇上进行预训练,将模型词表大小扩展为 79,458。 132 | 现在它已经在[Hugging Face上线](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B),你应该基于它进行后续的模型训练。 133 | 134 | # 训练 135 | 136 | 以 LLaMA-EXT-7B 为基础模型,我们以如下超参数对模型进行训练。 137 | 138 | | Hyper parameter | Value | 139 | | ----------------- | ------ | 140 | | Precision | bf16 | 141 | | Epochs | 3 | 142 | | Batch size | 32 | 143 | | Learning rate | 5e-6 | 144 | | Weight decay | 0 | 145 | | Warmup ratio | 0.03 | 146 | | LR scheduler type | cosine | 147 | | Max length | 2048 | 148 | 149 | 可以使用我们开源的[代码](../README.md)进行模型训练。 150 | -------------------------------------------------------------------------------- /train/dschat_train_v1/reproduce_our_papers/Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md: -------------------------------------------------------------------------------- 1 | ### [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://github.com/LianjiaTech/BELLE/blob/main/docs/Towards%20Better%20Instruction%20Following%20Language%20Models%20for%20Chinese.pdf) 2 | 3 | # 论文简介 4 | 5 | 这篇论文研究了基于当前公开的指令数据训练得到的模型,能达到怎样的效果。我们在自有的1000条测试集上进行了量化评估,同时,为了提高模型在中文领域的性能和训练/推理效率,我们进一步扩展了LLaMA的词汇表,并在34亿个中文词汇上进行了二次预训练。 6 | 7 | 我们使用到的公开指令训练数据有: 8 | 9 | 1. GPT-3.5生成的Stanford alpaca 中文以及英文数据 10 | 2. GPT-4生成的Alpaca 中文以及英文数据 11 | 3. 
用户分享的与ChatGPT的真实对话数据sharegpt 12 | 13 | 我们着眼于探究训练数据类别对模型性能的影响。具体而言,我们考察了训练数据的数量、质量和语言分布等因素。 14 | 15 | 实验结果如下: 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
| Factor | Base model | Training data | Score_w/o_others |
| ------------ | ------------ | ---------------------------------------------- | ---------------- |
| 词表扩充 | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B | zh(alpaca-3.5&4) + sharegpt | 0.652 |
| 数据质量 | LLaMA-7B-EXT | zh(alpaca-3.5) | 0.642 |
| | LLaMA-7B-EXT | zh(alpaca-4) | 0.693 |
| 数据语言分布 | LLaMA-7B-EXT | cn(alpaca-3.5&4) | 0.679 |
| | LLaMA-7B-EXT | en(alpaca-3.5&4) | 0.659 |
| | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B-EXT | en(alpaca-3.5&4) + sharegpt | 0.668 |
| 数据规模 | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
| | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt + BELLE-0.5M-CLEAN | 0.762 |
| - | ChatGPT | - | 0.824 |
83 | 84 | 其中**BELLE-0.5M-CLEAN**是从我们内部的230万指令数据中清洗得到0.5M数据,其中包含单轮和多轮对话数据,和之前开放的0.5M数据不是同一批数据。这份数据还未开源,但是我们已经将实验中效果最好的模型 (score 0.762) 开源在[Hugging Face](https://huggingface.co/BelleGroup/BELLE-on-Open-Datasets). 85 | 86 | # 准备数据集 87 | 88 | ### 下载数据集 89 | 90 | 我们的论文中使用了五个开源数据集: 91 | 92 | | Data | URL | 93 | | -------------- | -------------------------------------------------------------------------------------------------- | 94 | | alpaca-3.5-en | https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json | 95 | | alpaca-3.5-zh | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/dat | 96 | | alpaca-4-en | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data.json | 97 | | alpaca-4-zh | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data_zh.json | 98 | | sharegpt$^1$ | https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main | 99 | 100 | $^1$:请注意,sharegpt 数据集可能会持续更新,使得与我们论文中使用的略有不同,但不会对实验结论有大的影响。 101 | 102 | ### 预处理 103 | 104 | ##### 清洗 sharegpt 数据集 105 | 106 | 我们采用了[Vicuna](https://github.com/lm-sys/FastChat/blob/main/docs/commands/data_cleaning.md)中的数据清洗方法。 107 | 108 | 1. 通过 `fastchat.data.clean_sharegpt` 将 html 转换为 markdown 109 | 2. 通过 `fastchat.data.optional_clean` 删除除英文和中文之外的其他语言 110 | 3. 通过 `fastchat.data.split_long_conversation` 将最大长度为 2048 个令牌的对话分割开 111 | 112 | #### 统一数据格式 113 | 114 | 我们将所有数据集统一为以下形式: 115 | 116 | ```python 117 | { 118 | "id": "uniq_sample_id", 119 | "conversations": [ 120 | {"from": "human", "value": "你好"}, 121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"}, 122 | {"from": "human", "value": "今天天气怎么样?"}, 123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"} 124 | ] 125 | } 126 | 127 | ``` 128 | 129 | # 下载 LLaMA-EXT-7B 130 | 131 | LLaMA-EXT-7B 是基于 [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) 的模型,进一步在 34 亿中文词汇上进行预训练,将模型词表大小扩展为 79,458。 132 | 现在它已经在[Hugging Face上线](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B),你应该基于它进行后续的模型训练。 133 | 134 | # 训练 135 | 136 | 以 LLaMA-EXT-7B 为基础模型,我们以如下超参数对模型进行训练。 137 | 138 | | Hyper parameter | Value | 139 | | ----------------- | ------ | 140 | | Precision | bf16 | 141 | | Epochs | 3 | 142 | | Batch size | 32 | 143 | | Learning rate | 5e-6 | 144 | | Weight decay | 0 | 145 | | Warmup ratio | 0.03 | 146 | | LR scheduler type | cosine | 147 | | Max length | 2048 | 148 | 149 | 可以使用我们开源的[代码](../README.md)进行模型训练。 150 | -------------------------------------------------------------------------------- /train/src/callbacks.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import torch 3 | from tqdm import tqdm 4 | from transformers import ( 5 | TrainerCallback, 6 | Trainer, 7 | TrainingArguments, 8 | TrainerState, 9 | TrainerControl, 10 | PreTrainedModel, 11 | GenerationConfig 12 | ) 13 | from typing import Callable, Union 14 | import weakref 15 | from torch.utils.data import DataLoader, SequentialSampler 16 | import pudb 17 | 18 | 19 | class GenerationAndSaveBestModelsCallback(TrainerCallback): 20 | def __init__( 21 | self, 22 | trainer: Trainer, 23 | metric_fn: Callable[ 24 | [ 25 | Union[torch.Tensor, torch.cuda.FloatTensor], 26 | Union[torch.Tensor, torch.cuda.FloatTensor], 27 | ], 28 | float, 29 | ], 30 | eval_dataset, 31 | save_dir, 32 | num_best_models=3, 33 | eval_steps: int = 100, 34 | ): 35 | super().__init__() 36 | self.trainer_ref = weakref.ref(trainer) # 创建Trainer的弱引用 37 | ignored_columns = 
list( 38 | set(eval_dataset.column_names) - {'input_ids', 'labels'}) 39 | eval_dataset = eval_dataset.remove_columns(ignored_columns) 40 | self.eval_dataloader = DataLoader( 41 | eval_dataset, 42 | sampler=SequentialSampler(eval_dataset), 43 | batch_size=trainer.args.eval_batch_size, 44 | collate_fn=trainer.data_collator, 45 | drop_last=trainer.args.dataloader_drop_last, 46 | num_workers=trainer.args.dataloader_num_workers, 47 | pin_memory=trainer.args.dataloader_pin_memory, 48 | ) 49 | self.save_dir = save_dir 50 | self.num_best_models = num_best_models 51 | self.eval_steps = eval_steps 52 | self.best_eval_results = [] 53 | self.metric_fn = metric_fn 54 | 55 | self.generation_config = GenerationConfig( 56 | temperature=0.001, 57 | top_k=30, 58 | top_p=0.85, 59 | do_sample=True, 60 | num_beams=1, 61 | bos_token_id=1, 62 | eos_token_id=2, 63 | pad_token_id=0, 64 | repetition_penalty=1.2, 65 | max_new_tokens=1024, 66 | min_new_tokens=1, 67 | ) 68 | 69 | def on_step_end( 70 | self, 71 | args: TrainingArguments, 72 | state: TrainerState, 73 | control: TrainerControl, 74 | **kwargs, 75 | ): 76 | if state.global_step % self.eval_steps == 0: 77 | trainer = self.trainer_ref() # 获取Trainer对象的引用 78 | if trainer is None: 79 | return 80 | if args.local_rank != -1: 81 | torch.distributed.barrier() # 同步所有进程 82 | # TODO: 只实现了单个rank的eval, 还没实现多个rank并行eval 83 | if state.is_local_process_zero: 84 | model: PreTrainedModel = trainer.model 85 | model.eval() 86 | 87 | with torch.no_grad(): 88 | metric_value = 0 89 | total_samples = 0 90 | pudb.set_trace() 91 | for batch in tqdm(self.eval_dataloader, desc='Customize Eval'): 92 | input_ids = batch["input_ids"].to(model.device) 93 | generated_ids = model.generate( 94 | input_ids=input_ids, generation_config=self.generation_config) 95 | # 从generated_ids计算得到的指标值 96 | metric = self.metric_fn(generated_ids, batch["labels"]) 97 | metric_value += metric * input_ids.size(0) 98 | total_samples += input_ids.size(0) 99 | metric_value /= total_samples 100 | if len(self.best_eval_results) < self.num_best_models: 101 | self.best_eval_results.append( 102 | (metric_value, state.global_step)) 103 | # TODO: lora需要额外处理 104 | model.save_pretrained( 105 | f"{self.save_dir}/best_model_{state.global_step}" 106 | ) 107 | else: 108 | self.best_eval_results.sort() 109 | worst_result, worst_step = self.best_eval_results[0] 110 | if metric_value > worst_result: 111 | self.best_eval_results[0] = ( 112 | metric_value, 113 | state.global_step, 114 | ) 115 | # TODO: lora需要额外处理 116 | model.save_pretrained( 117 | f"{self.save_dir}/best_model_{state.global_step}" 118 | ) 119 | shutil.rmtree( 120 | f"{self.save_dir}/best_model_{worst_step}") 121 | if args.local_rank != -1: 122 | torch.distributed.barrier() # 同步所有进程 123 | 124 | 125 | def metric_fn(generated_ids, label_ids): 126 | # 这是一个简单的示例,实际的metric_fn应该根据你的具体任务和评估标准来定义 127 | return generated_ids.mean().item() 128 | -------------------------------------------------------------------------------- /train/dschat_train_v1/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import random 8 | import numpy as np 9 | from transformers import set_seed 10 | import deepspeed 11 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 12 | 13 | 14 | def print_rank_0(msg, rank=0): 15 | if rank <= 0: 16 | print(msg) 17 | 18 | 19 | def to_device(batch, device): 20 | output = {} 21 | for k, v in batch.items(): 22 | try: 23 | output[k] = v.to(device) 24 | except: 25 | output[k] = v 26 | return output 27 | 28 | 29 | class MovingAverage: 30 | 31 | def __init__(self): 32 | self.count = 0 33 | self.total = 0 34 | self.mean = 0 35 | 36 | def update(self, num): 37 | self.total += num 38 | self.count += 1 39 | self.mean = self.total / self.count 40 | 41 | return self.mean 42 | 43 | 44 | def save_hf_format(model, tokenizer, args, sub_folder=""): 45 | # used to save huggingface format, so we can use it for hf.from_pretrained 46 | model_to_save = model.module if hasattr(model, 'module') else model 47 | CONFIG_NAME = "config.json" 48 | WEIGHTS_NAME = "pytorch_model.bin" 49 | output_dir = os.path.join(args.output_dir, sub_folder) 50 | if not os.path.exists(output_dir): 51 | os.makedirs(output_dir) 52 | output_model_file = os.path.join(output_dir, WEIGHTS_NAME) 53 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 54 | save_dict = model_to_save.state_dict() 55 | for key in list(save_dict.keys()): 56 | if "lora" in key: 57 | del save_dict[key] 58 | torch.save(save_dict, output_model_file) 59 | model_to_save.config.to_json_file(output_config_file) 60 | tokenizer.save_vocabulary(output_dir) 61 | 62 | 63 | def set_random_seed(seed): 64 | if seed is not None: 65 | set_seed(seed) 66 | random.seed(seed) 67 | np.random.seed(seed) 68 | torch.manual_seed(seed) 69 | torch.cuda.manual_seed_all(seed) 70 | 71 | 72 | def get_all_reduce_mean(tensor): 73 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) 74 | tensor = tensor / torch.distributed.get_world_size() 75 | return tensor 76 | 77 | 78 | def get_optimizer_grouped_parameters(model, 79 | weight_decay, 80 | no_decay_name_list=[ 81 | "bias", "LayerNorm.weight" 82 | ]): 83 | optimizer_grouped_parameters = [ 84 | { 85 | "params": [ 86 | p for n, p in model.named_parameters() 87 | if (not any(nd in n 88 | for nd in no_decay_name_list) and p.requires_grad) 89 | ], 90 | "weight_decay": 91 | weight_decay, 92 | }, 93 | { 94 | "params": [ 95 | p for n, p in model.named_parameters() 96 | if (any(nd in n 97 | for nd in no_decay_name_list) and p.requires_grad) 98 | ], 99 | "weight_decay": 100 | 0.0, 101 | }, 102 | ] 103 | return optimizer_grouped_parameters 104 | 105 | 106 | def _z3_params_to_fetch(param_list): 107 | return [ 108 | p for p in param_list 109 | if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE 110 | ] 111 | 112 | 113 | def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): 114 | zero_stage_3 = (zero_stage == 3) 115 | with torch.no_grad(): 116 | for param, param_ema in zip(model.parameters(), 117 | model_ema.parameters()): 118 | # TODO: use prefiltering for efficiency 119 | params_to_fetch = _z3_params_to_fetch([param, param_ema 120 | ]) if zero_stage_3 else [] 121 | should_gather_param = len(params_to_fetch) > 0 122 | with deepspeed.zero.GatheredParameters( 123 | params_to_fetch, enabled=should_gather_param): 124 | data = param.data 125 | if device is not None: 126 | data = data.to(device) 127 | param_ema.data.copy_(torch.lerp(data, param_ema.data, 
beta)) 128 | 129 | 130 | def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): 131 | zero_stage_3 = (zero_stage == 3) 132 | os.makedirs(save_dir, exist_ok=True) 133 | WEIGHTS_NAME = "pytorch_model.bin" 134 | output_model_file = os.path.join(save_dir, WEIGHTS_NAME) 135 | 136 | model_to_save = model_ema.module if hasattr(model_ema, 137 | 'module') else model_ema 138 | if not zero_stage_3: 139 | if global_rank == 0: 140 | torch.save(model_to_save.state_dict(), output_model_file) 141 | else: 142 | output_state_dict = {} 143 | for k, v in model_to_save.named_parameters(): 144 | 145 | if hasattr(v, 'ds_id'): 146 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v 147 | ]), 148 | enabled=zero_stage_3): 149 | v_p = v.data.cpu() 150 | else: 151 | v_p = v.cpu() 152 | if global_rank == 0 and "lora" not in k: 153 | output_state_dict[k] = v_p 154 | if global_rank == 0: 155 | torch.save(output_state_dict, output_model_file) 156 | del output_state_dict 157 | -------------------------------------------------------------------------------- /train/dschat_train_v1/prompt_eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, AutoConfig 4 | import argparse 5 | from tqdm import tqdm 6 | import json, os 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument('--model_name_or_path',required=True,type=str) 10 | parser.add_argument('--finetuned_model_name_or_path',required=True,type=str) 11 | parser.add_argument('--test_file',required=True,type=str) 12 | parser.add_argument('--predictions_file', default='./predictions.json', type=str) 13 | args = parser.parse_args() 14 | 15 | print("test_file: " + args.test_file) 16 | print("model_name_or_path: " + args.model_name_or_path) 17 | print("finetuned_model_name_or_path: " + args.finetuned_model_name_or_path) 18 | 19 | max_new_tokens = 1024 20 | generation_config = dict( 21 | temperature=0.001, 22 | top_k=30, 23 | top_p=0.85, 24 | do_sample=True, 25 | num_beams=1, 26 | repetition_penalty=1.2, 27 | max_new_tokens=max_new_tokens 28 | ) 29 | 30 | 31 | def read_data(filename): 32 | res = [] 33 | with open(filename, 'r', encoding='utf-8') as f: 34 | lines = f.readlines() 35 | for line in lines: 36 | res.append(json.loads(line.strip())) 37 | return res 38 | 39 | 40 | input_items = read_data(args.test_file) 41 | output_items = [] 42 | 43 | def write_data(filename, examples): 44 | with open(filename, 'w', encoding='utf-8') as f: 45 | for example in examples: 46 | f.write(json.dumps(example, ensure_ascii=False) + "\n") 47 | 48 | print("predictions will be written at {}".format(args.predictions_file)) 49 | 50 | def get_input_text(input_item): 51 | conversations = input_item['conversations'] 52 | conv_turn = len(conversations) 53 | for i, sentence in conversations: 54 | sentence_from = sentence["from"].lower() 55 | sentence_value = 'Human: ' + sentence["value"] + '\n\nAssistant: ' if sentence_from == 'human' else sentence["value"] 56 | conversation += sentence_value 57 | sentence_ids = tokenizer.encode(sentence_value, add_special_tokens=False)#do not add bos_token_id 58 | label = copy.deepcopy(sentence_ids) if sentence_from != 'human' else [IGNORE_INDEX] * len(sentence_ids) 59 | input_ids += sentence_ids 60 | 61 | def _addrole_masklabel_tokenize(source): 62 | ''' 63 | add speaker and concatenate the sentences 64 | { 65 | "id": "uniq_sample_id", 66 | "conversations": [ 67 | {"from": "human", 
"value": "你好"}, 68 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"}, 69 | {"from": "human", "value": "今天天气怎么样?"}, 70 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"} 71 | ] 72 | } 73 | tokenizer_bloomz.encode("你好,有什么可以帮助你的吗?") == [41381, 355, 37242, 205599, 7336, 10468] 74 | tokenizer_llama.encode("你好,有什么可以帮助你的吗?") == [1, 29871, 30919, 31076, 30214, 30417, 231, 190, 131, 31882, 30682, 30651, 232, 187, 177, 31931, 30919, 30210, 232, 147, 154, 30882] 75 | ''' 76 | 77 | conversation = '' 78 | input_ids = [] 79 | for sentence in source[:-1]: 80 | sentence_from = sentence["from"].lower() 81 | sentence_value = 'Human: ' + sentence["value"] + '\n\nAssistant: ' if sentence_from == 'human' else sentence["value"] 82 | conversation += sentence_value 83 | sentence_ids = tokenizer.encode(sentence_value, add_special_tokens=False)#do not add bos_token_id 84 | input_ids += sentence_ids 85 | if sentence_from != 'human': 86 | input_ids += [tokenizer.eos_token_id]#make sure eos_token_id is correct 87 | 88 | return input_ids, conversation 89 | 90 | if __name__ == '__main__': 91 | load_type = torch.float16 92 | if torch.cuda.is_available(): 93 | device = torch.device(0) 94 | else: 95 | device = torch.device('cpu') 96 | 97 | if "llama" in args.model_name_or_path: 98 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path) 99 | else: 100 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 101 | 102 | tokenizer.pad_token_id = 0 103 | tokenizer.eos_token_id = 2 104 | model_config = AutoConfig.from_pretrained(args.model_name_or_path) 105 | print("Loading model") 106 | model = AutoModelForCausalLM.from_pretrained( 107 | args.finetuned_model_name_or_path, 108 | torch_dtype=load_type, 109 | config=model_config, 110 | ) 111 | 112 | model.to(device) 113 | model.eval() 114 | print("Load model successfully") 115 | 116 | index = 0 117 | 118 | for i in tqdm(range(len(input_items)), total=len(input_items), unit="item"): 119 | batch_input_items = input_items[i] 120 | 121 | input_ids, conversation = _addrole_masklabel_tokenize(source=input_items[i]['conversations']) 122 | input_ids = input_ids[:2048] 123 | if "Human" not in conversation: 124 | continue 125 | attention_mask = [1] * len(input_ids) 126 | input_ids = torch.LongTensor(input_ids).unsqueeze(0) 127 | attention_mask = torch.LongTensor(attention_mask).unsqueeze(0) 128 | #(1, max_seq_len) 129 | generation_output = model.generate( 130 | input_ids = input_ids.to(device), 131 | attention_mask = attention_mask.to(device), 132 | **generation_config 133 | ) 134 | 135 | generate_text = tokenizer.decode(generation_output[0].cpu().tolist(),skip_special_tokens=True) 136 | output_items.append({"generate_text": generate_text}) 137 | if index%10 == 0: 138 | print("conversation: ", conversation) 139 | print("generate_text: ", generate_text) 140 | print("-"*100) 141 | index += 1 142 | 143 | write_data(args.predictions_file, output_items) 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CALM: Credit and Risk Assessment Large Language Model 2 | 3 | - Due to licensing restrictions on [LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) weights, the model cannot be used for commercial purposes. Please adhere strictly to LLaMA's usage policy. 4 | - Considering the limitations of LLaMA's license, we cannot directly distribute the complete model weights. 
Here, we are only releasing the LoRA weights of [CALM-7B](https://huggingface.co/daishen/CALM-7B). 5 | 6 | ## Content 7 | 8 | - [CALM: Credit and Risk Assessment Large Language Model](#calm-credit-and-risk-assessment-large-language-model) 9 | - [Content](#content) 10 | - [1. Preparing the environment](#1-preparing-the-environment) 11 | - [2. Run](#2-run) 12 | - [2.1 Download data](#21-download-data) 13 | - [2.1.1 Convert data format](#211-convert-data-format) 14 | - [2.2 Model training](#22-model-training) 15 | - [2.2.1 LoRA](#221-lora) 16 | - [2.2.2 Merge Model with LORA](#222-merge-model-with-lora) 17 | 18 | ## 1. Preparing the environment 19 | 20 | Create the environment with Conda, then install the required packages with pip. 21 | 22 | ```bash 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## 2. Run 27 | 28 | ### 2.1 Download data 29 | 30 | Before running, please download the [raw data](https://huggingface.co/datasets/daishen/CALM-Data/tree/main) to `data/CRA_resample_0.045M.json`. 31 | 32 | #### 2.1.1 Convert data format 33 | 34 | ```bash 35 | export raw_data=/path_to/CRA_resample_0.045M.json 36 | export conv_data=/path_to/CRA_resample_0.045M_conv.json 37 | export data_name=CRA 38 | export dev_data=/path_to/CRA-resample-dev3k.json 39 | export train_data=/path_to/CRA-resample-train4w.json 40 | 41 | python scripts/convert_to_conv_data.py \ 42 | --orig_data ${raw_data} \ 43 | --write_data ${conv_data} \ 44 | --dataset_name CRA 45 | head -n 3000 ${conv_data} > ${dev_data} 46 | tail -n +3001 ${conv_data} > ${train_data} 47 | ``` 48 | 49 | We designate the first 3000 entries as the validation set, while the remaining data serves as the training set. 50 | 51 | ### 2.2 Model training 52 | 53 | Training strategy: 54 | 55 | * LoRA + int8 56 | 57 | The launch script for training is `train/scripts/run_sft.sh`. You will need to modify the parameters in `run_sft.sh` according to your specific requirements.
58 | 59 | ```bash 60 | bash scripts/run_sft.sh 61 | ``` 62 | 63 | - model_name_or_path: The pretrained model (if it is an [LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model, it needs to be converted to the hf format beforehand in order to be loaded using from_pretrained) 64 | - train_file: Training data 65 | - validation_file: Validation data 66 | - output_dir: Path to the training logs and model saves 67 | - cache_dir: Path to the cache data processing process 68 | - cutoff_len: Maximum input sequence length ([LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model suggests setting it to 1024 or above, [Bloom](https://huggingface.co/bigscience/bloom) model suggests setting it to 512 or above) 69 | 70 | #### 2.2.1 LoRA 71 | 72 | ```bash 73 | nohup torchrun --nproc_per_node 2 src/entry_point/sft_train.py \ 74 | --model_name_or_path ${model_name_or_path} \ 75 | --bf16 True \ 76 | --llama True \ 77 | --use_lora True \ 78 | --deepspeed configs/deepspeed_config_stage3.json \ 79 | --lora_config configs/lora_config_llama.json \ 80 | --train_file ${train_file} \ 81 | --validation_file ${validation_file} \ 82 | --per_device_train_batch_size 6 \ 83 | --per_device_eval_batch_size 6 \ 84 | --gradient_accumulation_steps 1 \ 85 | --num_train_epochs 5 \ 86 | --model_max_length ${cutoff_len} \ 87 | --save_strategy "steps" \ 88 | --save_total_limit 3 \ 89 | --learning_rate 3e-4 \ 90 | --weight_decay 0.00001 \ 91 | --warmup_ratio 0.01 \ 92 | --lr_scheduler_type "cosine" \ 93 | --logging_steps 10 \ 94 | --evaluation_strategy "steps" \ 95 | --seed 1234 \ 96 | --gradient_checkpointing \ 97 | --cache_dir ${cache_dir} \ 98 | --output_dir ${output_dir} \ 99 | --overwrite_output_dir \ 100 | > ${log_dir}/train.log 2>&1 & 101 | ``` 102 | 103 | **Parameters** 104 | 105 | * use_lora: Training using LoRA 106 | * use_int8_training: Training with 8-bit quantization, which significantly reduces memory usage 107 | * lora_config: The parameter configuration for LoRA is provided. If training a [Bloom](https://huggingface.co/bigscience/bloom) model, it should be changed to "`configs/lora_config_bloom.json`" 108 | * deepspeed When training sequences are long, it is recommended to utilize deepspeed stage 3, which effectively distributes model parameters across multiple cards, allowing room to load even longer sequences 109 | 110 | **Note:** Please be aware that you can only choose between "`use_int8_training`" and "`deepspeed`"; they cannot be used simultaneously. 111 | 112 | The structure of the **output_dir**: 113 | 114 | ``` 115 | output_dir/ 116 | ├── checkpoint-244/ 117 | │ ├── pytorch_model.bin 118 | │ └── trainer_state.json 119 | ├── checkpoint-527/ 120 | │ ├── pytorch_model.bin 121 | │ └── trainer_state.json 122 | ├── adapter_model.bin 123 | ├── print_log.txt 124 | └── adapter_config.json 125 | ``` 126 | 127 | The highest-level directory stores the final model obtained from the training process. 
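Before merging (see the next section), you can sanity-check the trained adapter by loading it directly on top of the base model with PEFT. The snippet below is a minimal sketch rather than one of the released scripts: the two paths, the example prompt, and the generation settings are placeholders to adapt to your setup, and it assumes a LLaMA base model with the same special-token ids used during training.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, GenerationConfig, LlamaTokenizer

base_model_path = "model_path_to/llama-2-7b-chat-T/"  # placeholder: base model used for training
adapter_path = "path_to/output_dir/"                  # placeholder: folder holding adapter_model.bin / adapter_config.json

tokenizer = LlamaTokenizer.from_pretrained(base_model_path)
# Special-token ids the training scripts assume for LLaMA
tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2

# Load the frozen base model, then attach the trained LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, adapter_path, torch_dtype=torch.float16)
model.eval()
model.to("cuda:0")

# Prompt format follows the training code (src/sample_generator.py)
prompt = "Human: \nPlease assess the credit risk in the following loan application ...\n\nAssistant: \n"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda:0")
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(
            max_new_tokens=256, do_sample=False, repetition_penalty=1.2
        ),
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))
```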
128 | 129 | #### 2.2.2 Merge Model with LORA 130 | 131 | If you wish to merge the weights of LoRA with a pre-trained model, you can execute the following command: 132 | 133 | ```bash 134 | model_name_or_path=model_path_to/llama-2-7b-chat-T/ 135 | lora_path=lora_path_to/checkpoint_2/3739 136 | output_path=out_path_to/CRA__model_2/model_3739 137 | 138 | CUDA_VISIBLE_DEVICES=0 python src/merge_llama_with_lora.py \ 139 | --model_name_or_path ${model_name_or_path} \ 140 | --output_path ${output_path} \ 141 | --lora_path ${lora_path} \ 142 | --llama 143 | ``` 144 | 145 | The merged weights will be saved in the "`output_path`" directory. You can subsequently load them directly using "`from_pretrained`". 146 | -------------------------------------------------------------------------------- /train/debug/app.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union 2 | from peft import PeftModel 3 | import torch 4 | from transformers import LlamaTokenizer, AutoModelForCausalLM, GenerationConfig 5 | from flask import Flask, render_template, request 6 | import sys 7 | sys.path.append("../src") 8 | 9 | 10 | def load_model(): 11 | model_name_or_path = '/nfs/a100-80G-17/jiyunjie/finetuned_ckpt/on_belle_tokenizer50k_openinstr_zh/zh_alpaca_gpt3.5_gpt4_sharegpt_epoch=2-step=20652' 12 | ckpt_path = '/nfs/a100-006/hanweiguang/saved_model/boxue_1_bs-8_lr-3e-4_wm-1e-2_epoch-10_lora' 13 | 14 | print('Loading model...') 15 | # Initialize the model and tokenizer 16 | load_type = torch.float16 17 | tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path) 18 | tokenizer.pad_token_id = 0 19 | tokenizer.bos_token_id = 1 20 | tokenizer.eos_token_id = 2 21 | tokenizer.padding_side = "left" 22 | base_model = AutoModelForCausalLM.from_pretrained( 23 | model_name_or_path, torch_dtype=load_type 24 | ) 25 | model = PeftModel.from_pretrained( 26 | base_model, ckpt_path, torch_dtype=load_type 27 | ) 28 | model.eval() 29 | model.to('cuda:0') 30 | print('Model loaded!') 31 | return model, tokenizer 32 | 33 | # Initialize flask 34 | app = Flask(__name__) 35 | 36 | model, tokenizer = None, None 37 | 38 | 39 | @app.before_first_request 40 | def initialize_model(): 41 | global model, tokenizer 42 | model, tokenizer = load_model() 43 | 44 | 45 | def get_probability_per_token(input_ids: torch.Tensor) -> Tuple[List[float], float]: 46 | """ 47 | input_ids: [1, sen_len] 48 | """ 49 | with torch.no_grad(): 50 | outputs = model(input_ids=input_ids, labels=input_ids) 51 | logits = outputs.logits 52 | probs = logits.softmax(dim=-1) 53 | # Select the probabilities of the generated tokens 54 | # [1, sen_len, 1] 55 | generated_token_probs = torch.gather( 56 | probs[:, :-1, :], 2, input_ids[:, 1:, None]) # skip the first token_id 57 | return generated_token_probs[0, ..., 0].tolist(), outputs.loss.item() 58 | 59 | 60 | def decode_and_get_probability(input_text: str, generation_config: GenerationConfig): 61 | input_ids = tokenizer.encode(input_text, return_tensors='pt').to( 62 | getattr(model, 'module', model).device) 63 | with torch.no_grad(): 64 | output = model.generate(input_ids=input_ids, 65 | generation_config=generation_config) 66 | token_ids = output[0].tolist() 67 | tokens = [tokenizer.decode([token_id], skip_special_tokens=False) 68 | for token_id in token_ids] 69 | token_probs_list, loss = get_probability_per_token(output) 70 | 71 | # The probability of is set to 0 72 | tokens_with_probs = [{'token': tokens[0], 'prob': 0.0}] 73 | for token, prob in 
zip(tokens[1:], token_probs_list): 74 | tokens_with_probs.append({ 75 | 'token': token, 76 | 'prob': prob 77 | }) 78 | 79 | return tokens_with_probs, loss 80 | 81 | 82 | def get_probability_of_text(input_text: str) -> List[Dict[str, Union[float, int]]]: 83 | input_ids = tokenizer.encode( 84 | input_text, return_tensors='pt').to( 85 | getattr(model, 'module', model).device 86 | ) 87 | token_ids = input_ids[0].tolist() 88 | tokens = [tokenizer.decode([token_id], skip_special_tokens=False) 89 | for token_id in token_ids] 90 | token_probs_list, loss = get_probability_per_token(input_ids) 91 | 92 | # The probability of is set to 0 93 | tokens_with_probs = [{'token': tokens[0], 'prob': 0.0}] 94 | for token, prob in zip(tokens[1:], token_probs_list): 95 | tokens_with_probs.append({ 96 | 'token': token, 97 | 'prob': prob 98 | }) 99 | 100 | return tokens_with_probs, loss 101 | 102 | 103 | @app.route("/inference", methods=['POST']) 104 | def inference(): 105 | input_text = request.form.get('input_text') 106 | if input_text is None: 107 | # 处理找不到输入文本的情况 108 | return render_template('error.html', message='Input text is missing') 109 | tokens_with_probs, loss = get_probability_of_text(input_text) 110 | return render_template('output.html', tokens_with_probs=tokens_with_probs, loss=loss) 111 | 112 | 113 | @app.route('/generate', methods=['POST']) 114 | def generate(): 115 | input_text = request.form.get('input_text') 116 | if input_text is None: 117 | # 处理找不到输入文本的情况 118 | return render_template('error.html', message='Input text is missing') 119 | 120 | # Fetch the parameters from the form 121 | temperature = float(request.form.get('temperature')) 122 | top_p = float(request.form.get('top_p')) 123 | top_k = int(request.form.get('top_k')) 124 | num_beams = int(request.form.get('num_beams')) 125 | max_new_tokens = int(request.form.get('max_new_tokens')) 126 | min_new_tokens = int(request.form.get('min_new_tokens')) 127 | repetition_penalty = float(request.form.get('repetition_penalty')) 128 | do_sample = bool(request.form.get('do_sample')) 129 | 130 | generation_config = GenerationConfig( 131 | temperature=temperature, 132 | top_p=top_p, 133 | top_k=top_k, 134 | num_beams=num_beams, 135 | bos_token_id=1, 136 | eos_token_id=2, 137 | pad_token_id=0, 138 | max_new_tokens=max_new_tokens, # max_length=max_new_tokens+input_sequence 139 | min_new_tokens=min_new_tokens, # min_length=min_new_tokens+input_sequence 140 | repetition_penalty=repetition_penalty, 141 | do_sample=do_sample, 142 | ) 143 | 144 | tokens_with_probs, loss = decode_and_get_probability( 145 | input_text, generation_config) 146 | return render_template('output.html', tokens_with_probs=tokens_with_probs, loss=loss) 147 | 148 | 149 | @app.route('/') 150 | def home(): 151 | return render_template('index.html') 152 | 153 | 154 | if __name__ == "__main__": 155 | app.run(debug=True, port=5000) 156 | -------------------------------------------------------------------------------- /train/dschat_train_v1/utils/module/lora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import math 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from deepspeed.compression.helper import recursive_getattr, recursive_setattr 10 | import deepspeed 11 | 12 | 13 | class LinearLayer_LoRA(nn.Module): 14 | # an simple implementation of LoRA 15 | # for now only support Linear Layer 16 | def __init__(self, 17 | weight, 18 | lora_dim=0, 19 | lora_alpha=16, 20 | lora_droppout=0.05, 21 | bias=None): 22 | super(LinearLayer_LoRA, self).__init__() 23 | self.weight = weight 24 | self.bias = bias 25 | 26 | if lora_dim <= 0: 27 | raise ValueError( 28 | "You are training to use LoRA, whose reduced dim should be larger than 1" 29 | ) 30 | 31 | try: 32 | # for zero stage 3 33 | rows, columns = weight.ds_shape 34 | except: 35 | rows, columns = weight.shape 36 | self.lora_right_weight = nn.Parameter(torch.zeros( 37 | columns, 38 | lora_dim)) # apply transpose so in forward we do not need to 39 | self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows)) 40 | self.lora_scaling = lora_alpha / lora_dim #alpha/rank 41 | 42 | if lora_droppout > 0: 43 | self.lora_dropout = nn.Dropout(lora_droppout) 44 | else: 45 | self.lora_dropout = nn.Identity() 46 | 47 | self.reset_parameters() 48 | # disable the original weight gradient 49 | self.weight.requires_grad = False 50 | # fuse LoRA to the original weight 51 | self.fuse_lora = False 52 | 53 | def eval(self): 54 | self.lora_dropout.eval() 55 | 56 | # self.fuse_lora_weight() 57 | 58 | def train(self, mode=True): 59 | self.lora_dropout.train(mode) 60 | # self.unfuse_lora_weight() 61 | 62 | def reset_parameters(self): 63 | nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5)) 64 | nn.init.zeros_(self.lora_left_weight) 65 | 66 | def fuse_lora_weight(self): 67 | if not self.fuse_lora: 68 | self.weight.data += self.lora_scaling * torch.matmul( 69 | self.lora_left_weight.t(), self.lora_right_weight.t()) 70 | self.fuse_lora = True 71 | 72 | def unfuse_lora_weight(self): 73 | if self.fuse_lora: 74 | self.weight.data -= self.lora_scaling * torch.matmul( 75 | self.lora_left_weight.t(), self.lora_right_weight.t()) 76 | self.fuse_lora = False 77 | 78 | def forward(self, input): 79 | if self.fuse_lora: 80 | return F.linear(input, self.weight, self.bias) 81 | else: 82 | return F.linear( 83 | input, self.weight, 84 | self.bias) + (self.lora_dropout(input) @ self.lora_right_weight 85 | @ self.lora_left_weight) * self.lora_scaling 86 | 87 | 88 | ''' 89 | LLaMA model named_modules: 90 | model.layers.29.mlp 91 | model.layers.29.mlp.gate_proj 92 | model.layers.29.mlp.down_proj 93 | model.layers.29.mlp.up_proj 94 | model.layers.29.mlp.act_fn 95 | model.layers.29.input_layernorm 96 | model.layers.29.post_attention_layernorm 97 | model.layers.30 98 | model.layers.30.self_attn 99 | model.layers.30.self_attn.q_proj 100 | model.layers.30.self_attn.k_proj 101 | model.layers.30.self_attn.v_proj 102 | model.layers.30.self_attn.o_proj 103 | model.layers.30.self_attn.rotary_emb 104 | ''' 105 | # convert the linear layer to LoRA 106 | def convert_linear_layer_to_lora(model, 107 | lora_module_name, 108 | lora_dim=0, 109 | lora_alpha=1, 110 | lora_droppout=0): 111 | def set_params(param_modules, name): 112 | for each_ in param_modules: 113 | if each_ in name: 114 | return True 115 | return False 116 | 117 | repalce_name = [] 118 | for name, module in model.named_modules(): 119 | if isinstance(module, nn.Linear) and set_params(lora_module_name, name): 120 | 
repalce_name.append(name) 121 | 122 | print("repalce_name : ", repalce_name) 123 | for name in repalce_name: 124 | module = recursive_getattr(model, name) 125 | tmp = LinearLayer_LoRA( 126 | module.weight, lora_dim, lora_alpha, lora_droppout, 127 | module.bias).to(module.weight.device).to(module.weight.dtype) 128 | recursive_setattr(model, name, tmp) 129 | return model 130 | 131 | 132 | 133 | def _z3_params_to_fetch(param_list): 134 | return [ 135 | p for p in param_list 136 | if hasattr(p, 'ds_id') and p.ds_status == deepspeed.runtime.zero. 137 | partition_parameters.ZeroParamStatus.NOT_AVAILABLE 138 | ] 139 | 140 | 141 | # convert the LoRA layer to linear layer 142 | def convert_lora_to_linear_layer(model): 143 | repalce_name = [] 144 | for name, module in model.named_modules(): 145 | if isinstance(module, LinearLayer_LoRA): 146 | repalce_name.append(name) 147 | for name in repalce_name: 148 | module = recursive_getattr(model, name) 149 | zero_stage_3 = hasattr(module.weight, 'ds_id') 150 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([ 151 | module.weight, module.bias, module.lora_left_weight, 152 | module.lora_right_weight 153 | ]), 154 | modifier_rank=0, 155 | enabled=zero_stage_3): 156 | module.fuse_lora_weight() 157 | return model 158 | 159 | 160 | def only_optimize_lora_parameters(model): 161 | # turn off the gradient of all the parameters except the LoRA parameters 162 | for name, param in model.named_parameters(): 163 | if "lora_right_weight" in name or "lora_left_weight" in name: 164 | param.requires_grad = True 165 | else: 166 | param.requires_grad = False 167 | return model 168 | -------------------------------------------------------------------------------- /train/dschat_train_v1/utils/data/raw_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from datasets import disable_caching 6 | disable_caching() 7 | from datasets import load_dataset 8 | from torch.utils.data import Subset 9 | import re 10 | import os 11 | 12 | 13 | # The template prompt dataset class that all new dataset porting needs to 14 | # follow in order to have a unified API and unified data format. 15 | class PromptRawDataset(object): 16 | 17 | def __init__(self, output_path, seed, local_rank): 18 | self.output_path = output_path 19 | self.seed = seed 20 | self.local_rank = local_rank 21 | 22 | def get_train_data(self): 23 | return 24 | 25 | def get_eval_data(self): 26 | return 27 | 28 | # The prompt should be in the format of: " Human: " + actual_prompt_sentence + " Assistant:" 29 | def get_prompt(self, sample): 30 | return 31 | 32 | # The chosen response should be in the format of: " " + actual_response_sentence 33 | def get_chosen(self, sample): 34 | return 35 | 36 | # The rejected response should be in the format of: " " + actual_response_sentence 37 | # If the dataset does not have rejected response, return None 38 | def get_rejected(self, sample): 39 | return 40 | 41 | def get_prompt_and_chosen(self, sample): 42 | return 43 | 44 | def get_prompt_and_rejected(self, sample): 45 | return 46 | 47 | 48 | # English dataset. 
https://huggingface.co/datasets/Dahoas/rm-static 49 | class DahoasRmstaticDataset(PromptRawDataset): 50 | 51 | def __init__(self, output_path, seed, local_rank): 52 | super().__init__(output_path, seed, local_rank) 53 | self.dataset_name = "Dahoas/rm-static" 54 | self.dataset_name_clean = "Dahoas_rm_static" 55 | self.raw_datasets = load_dataset("Dahoas/rm-static") 56 | 57 | def get_train_data(self): 58 | return self.raw_datasets["train"] 59 | 60 | def get_eval_data(self): 61 | return self.raw_datasets["test"] 62 | 63 | def get_prompt(self, sample): 64 | return sample['prompt'] 65 | 66 | def get_chosen(self, sample): 67 | return sample['chosen'] 68 | 69 | def get_rejected(self, sample): 70 | return sample['rejected'] 71 | 72 | def get_prompt_and_chosen(self, sample): 73 | return sample['prompt'] + sample['chosen'] 74 | 75 | def get_prompt_and_rejected(self, sample): 76 | return sample['prompt'] + sample['rejected'] 77 | 78 | 79 | # # Belleschool_math0.25K 80 | # class BelleOpenSoucreDataset(PromptRawDataset): 81 | 82 | # def __init__(self, output_path, seed, local_rank, data_file): 83 | # eval_data_file = "utils/data/dev1K.json" 84 | # super().__init__(output_path, seed, local_rank) 85 | # self.dataset_name = "BelleOpenSoucre" 86 | # self.dataset_name_clean = "BelleOpenSoucre" 87 | # dataset_cache_dir = "output/data_files" 88 | # print("data_file = ", data_file) 89 | # self.raw_datasets = load_dataset("json", data_files=data_file, cache_dir=dataset_cache_dir) 90 | # self.raw_datasets.cleanup_cache_files() 91 | # self.dev_raw_datasets = load_dataset("json", data_files=eval_data_file, cache_dir=dataset_cache_dir) 92 | # self.dev_raw_datasets.cleanup_cache_files() 93 | # print(self.raw_datasets["train"]) 94 | 95 | # def get_train_data(self): 96 | # return self.raw_datasets["train"] 97 | 98 | # def get_eval_data(self): 99 | # return self.dev_raw_datasets["train"] 100 | 101 | # def get_prompt(self, sample): 102 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: " 103 | 104 | # def get_chosen(self, sample): 105 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: " 106 | 107 | 108 | # def get_prompt_and_chosen(self, sample): 109 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: "+sample['output'] 110 | 111 | 112 | 113 | class BelleOpenSoucreDataset(PromptRawDataset): 114 | 115 | def __init__(self, output_path, seed, local_rank, data_file, eval_data_file=None): 116 | ''' 117 | { 118 | "id": "uniq_sample_id", 119 | "conversations": [ 120 | {"from": "human", "value": "你好"}, 121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"}, 122 | {"from": "human", "value": "今天天气怎么样?"}, 123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"} 124 | ] 125 | } 126 | LlamaTokenizer会自动加上bos_token_id,但是BloomTokenizer不会加上bos_token_id 127 | 两个tokenizer的bos_token_id和eos_token_id是相同的,pad_token_id强制设置为0 128 | ''' 129 | # eval_data_file = "utils/data/dev1K.json" 130 | 131 | super().__init__(output_path, seed, local_rank) 132 | self.dataset_name = "BelleOpenSoucre" 133 | self.dataset_name_clean = "BelleOpenSoucre" 134 | dataset_cache_dir = "output/data_files" 135 | print("data_file = ", data_file) 136 | self.raw_datasets = load_dataset("json", data_files=data_file, cache_dir=dataset_cache_dir) 137 | self.raw_datasets.cleanup_cache_files() 138 | 139 | if eval_data_file!=None and os.path.exists(eval_data_file): 140 | print("eval_data_file = ", eval_data_file) 141 | self.dev_raw_datasets = load_dataset("json", 
data_files=eval_data_file, cache_dir=dataset_cache_dir) 142 | self.dev_raw_datasets.cleanup_cache_files() 143 | self.train_data = self.raw_datasets["train"] 144 | self.eval_data = self.dev_raw_datasets["train"] 145 | else: 146 | train_val = self.raw_datasets["train"].train_test_split( 147 | test_size=1000, shuffle=True, seed=42 148 | ) 149 | self.train_data = train_val["train"] 150 | self.eval_data = train_val["test"] 151 | 152 | print("train_data: ", self.train_data) 153 | print("eval_data: ", self.eval_data) 154 | 155 | 156 | def get_train_data(self): 157 | return self.train_data 158 | 159 | def get_eval_data(self): 160 | return self.eval_data 161 | 162 | def get_conversations(self, sample): 163 | return sample['conversations'] 164 | -------------------------------------------------------------------------------- /train/src/entry_point/interface.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | import os 4 | import deepspeed 5 | import gradio as gr 6 | import torch 7 | from peft import PeftModel 8 | from transformers import ( 9 | AutoConfig, 10 | AutoModelForCausalLM, 11 | AutoTokenizer, 12 | GenerationConfig, 13 | LlamaTokenizer, 14 | ) 15 | import sys 16 | import traceback 17 | # import pudb 18 | 19 | # Break into the debugger when an exception occurs 20 | # def debug_on_exception(exctype, value, tb): 21 | # traceback.print_exception(exctype, value, tb) 22 | # pudb.post_mortem(tb) 23 | 24 | 25 | # sys.excepthook = debug_on_exception 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--local_rank", type=int, default=0) 29 | parser.add_argument("--model_name_or_path", type=str, required=True) 30 | parser.add_argument("--ckpt_path", type=str, required=True) 31 | parser.add_argument("--use_lora", action="store_true") 32 | parser.add_argument("--use_deepspeed", action="store_true") 33 | parser.add_argument("--llama", action="store_true") 34 | parser.add_argument("--base_port", default=17860, type=int) 35 | args = parser.parse_args() 36 | 37 | 38 | def generate_prompt(input_text): 39 | return input_text 40 | 41 | 42 | def evaluate( 43 | model, 44 | tokenizer, 45 | input: str, 46 | temperature=0.1, # controls randomness of generation: higher values give more random output, lower values more deterministic output 47 | top_p=0.75, # cumulative probability mass of the most likely tokens kept during sampling 48 | top_k=40, # only the top_k most likely tokens are considered at each step 49 | num_beams=4, # number of beams for beam search, a decoding strategy that tends to produce more fluent and consistent text 50 | do_sample=False, # whether to sample; True uses random sampling, otherwise greedy/beam decoding is used 51 | max_new_tokens=128, # maximum number of newly generated tokens, controls the output length 52 | min_new_tokens=1, # minimum number of newly generated tokens, keeps the output from being too short 53 | repetition_penalty=1.2, # penalty applied to repeated tokens in the generated text 54 | **kwargs, 55 | ): 56 | prompt = generate_prompt(input) 57 | inputs = tokenizer(prompt, return_tensors="pt") 58 | 59 | input_ids = inputs["input_ids"].to(getattr(model, 'module', model).device) 60 | 61 | generation_config = GenerationConfig( 62 | temperature=temperature, 63 | top_p=top_p, 64 | top_k=top_k, 65 | num_beams=num_beams, 66 | bos_token_id=tokenizer.bos_token_id, 67 | eos_token_id=tokenizer.eos_token_id, 68 | pad_token_id=tokenizer.pad_token_id, 69 | max_new_tokens=max_new_tokens, # max_length=max_new_tokens+input_sequence 70 | min_new_tokens=min_new_tokens, # min_length=min_new_tokens+input_sequence 71 | repetition_penalty=repetition_penalty, 72 | do_sample=do_sample, 73 | **kwargs, 74 | ) 75 | with torch.no_grad(): 76 | # pudb.set_trace() 77 | generation_output = model.generate( 78 | input_ids=input_ids, 79 | generation_config=generation_config, 80 |
return_dict_in_generate=True, 81 | output_scores=False 82 | ) 83 | output = generation_output.sequences[0] 84 | output = ( 85 | tokenizer.decode(output, skip_special_tokens=True) 86 | .strip() 87 | )[len(input):] 88 | return output 89 | 90 | 91 | if __name__ == "__main__": 92 | load_type = torch.float16 # Sometimes may need torch.float32 93 | 94 | if args.llama: 95 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path) 96 | else: 97 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 98 | 99 | tokenizer.pad_token_id = 0 # set the padding token id to 0 100 | tokenizer.bos_token_id = 1 # set the beginning-of-sequence token id to 1 101 | tokenizer.eos_token_id = 2 # set the end-of-sequence token id to 2 102 | tokenizer.padding_side = "left" # pad sequences on the left 103 | model_config = AutoConfig.from_pretrained(args.model_name_or_path) 104 | 105 | print(f"Rank {args.local_rank} loading model...") # "rank" identifies this process within the (multi-GPU) parallel setup 106 | 107 | if args.use_lora: 108 | base_model = AutoModelForCausalLM.from_pretrained( 109 | args.model_name_or_path, torch_dtype=load_type, config=model_config 110 | ) 111 | model = PeftModel.from_pretrained( 112 | base_model, args.ckpt_path, torch_dtype=load_type 113 | ) 114 | else: 115 | model = AutoModelForCausalLM.from_pretrained( 116 | args.ckpt_path, torch_dtype=load_type, config=model_config 117 | ) 118 | 119 | if not args.use_deepspeed: 120 | if torch.cuda.is_available(): 121 | device = torch.device('cuda') 122 | else: 123 | device = torch.device('cpu') 124 | if device == torch.device('cpu'): 125 | model.float() 126 | print(f'device: {device}') 127 | model.to(device) 128 | model.eval() 129 | else: 130 | model = deepspeed.init_inference( 131 | model, 132 | mp_size=int(os.getenv("WORLD_SIZE", "1")), 133 | dtype=torch.half, 134 | checkpoint=None, 135 | replace_with_kernel_inject=True, 136 | ) 137 | 138 | # model = None 139 | 140 | print("Load model successfully") 141 | # https://gradio.app/docs/ # Gradio is a Python library for building interactive web UIs around ML models 142 | gr.Interface( 143 | fn=partial(evaluate, model, tokenizer), 144 | inputs=[ 145 | gr.components.Textbox( 146 | lines=2, label="Input", placeholder="Welcome to the BELLE model" 147 | ), 148 | gr.components.Slider(minimum=0, maximum=1, 149 | value=0.1, label="Temperature"), 150 | gr.components.Slider(minimum=0, maximum=1, 151 | value=0.75, label="Top p"), 152 | gr.components.Slider( 153 | minimum=0, maximum=100, step=1, value=40, label="Top k" 154 | ), 155 | gr.components.Slider( 156 | minimum=1, maximum=4, step=1, value=1, label="Beams Number" 157 | ), 158 | gr.components.Checkbox( 159 | value=False, 160 | label="Do sample" 161 | ), 162 | gr.components.Slider( 163 | minimum=1, maximum=2000, step=10, value=512, label="Max New Tokens" 164 | ), 165 | gr.components.Slider( 166 | minimum=1, maximum=300, step=10, value=1, label="Min New Tokens" 167 | ), 168 | gr.components.Slider( 169 | minimum=1.0, 170 | maximum=2.0, 171 | step=0.1, 172 | value=1.2, 173 | label="Repetition Penalty", 174 | ) 175 | ], 176 | outputs=[ 177 | gr.components.Textbox( 178 | lines=25, 179 | label="Output", 180 | ) 181 | ], 182 | title="BELLE: Be Everyone's Large Language model Engine", 183 | ).queue().launch( 184 | share=True, server_name="0.0.0.0", server_port=args.base_port + args.local_rank 185 | ) 186 | -------------------------------------------------------------------------------- /train/src/apply_delta.py: -------------------------------------------------------------------------------- 1 | #!python 2 | # -*- coding: utf-8 -*- 3 | # @author: Kun 4 | 5 | import argparse 6 | import
gc 7 | import glob 8 | import json 9 | import os 10 | import shutil 11 | import tempfile 12 | 13 | from huggingface_hub import snapshot_download 14 | import torch 15 | from torch import nn 16 | from tqdm import tqdm 17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 18 | from transformers import LlamaTokenizer, LlamaConfig, LlamaForCausalLM 19 | 20 | """ 21 | Apply the delta weights on top of a base model. 22 | 23 | Usage: 24 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 25 | """ 26 | PROXIES={ 27 | 'http': os.environ.get('http_proxy', ''), 28 | 'https': os.environ.get('https_proxy', '') 29 | } 30 | GB = 1 << 30 31 | 32 | def split_files(model_path, tmp_path, split_size): 33 | if not os.path.exists(model_path): 34 | model_path = snapshot_download(repo_id=model_path) 35 | if not os.path.exists(tmp_path): 36 | os.makedirs(tmp_path) 37 | 38 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin") 39 | files = glob.glob(file_pattern) 40 | 41 | part = 0 42 | for file_path in tqdm(files): 43 | state_dict = torch.load(file_path) 44 | new_state_dict = {} 45 | 46 | current_size = 0 47 | try: 48 | for name, param in state_dict.items(): 49 | param_size = param.numel() * param.element_size() 50 | 51 | if current_size + param_size > split_size: 52 | new_file_name = f"pytorch_model-{part}.bin" 53 | new_file_path = os.path.join(tmp_path, new_file_name) 54 | torch.save(new_state_dict, new_file_path) 55 | current_size = 0 56 | new_state_dict = None 57 | gc.collect() 58 | new_state_dict = {} 59 | part += 1 60 | 61 | new_state_dict[name] = param 62 | current_size += param_size 63 | 64 | new_file_name = f"pytorch_model-{part}.bin" 65 | new_file_path = os.path.join(tmp_path, new_file_name) 66 | torch.save(new_state_dict, new_file_path) 67 | new_state_dict = None 68 | gc.collect() 69 | new_state_dict = {} 70 | part += 1 71 | except Exception as e: 72 | print(f"An error occurred during split_files: {e}") 73 | shutil.rmtree(tmp_path) 74 | raise 75 | 76 | 77 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): 78 | # base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False) 79 | # base_config = AutoConfig.from_pretrained(base_model_path) 80 | 81 | base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False, proxies=PROXIES) 82 | base_config = LlamaConfig.from_pretrained(base_model_path, proxies=PROXIES) 83 | 84 | if os.path.exists(target_model_path): 85 | shutil.rmtree(target_model_path) 86 | os.makedirs(target_model_path) 87 | 88 | split_size = 4 * GB 89 | 90 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: 91 | print(f"Split files for the base model to {tmp_base_path}") 92 | split_files(base_model_path, tmp_base_path, split_size) 93 | print(f"Split files for the delta model to {tmp_delta_path}") 94 | split_files(delta_path, tmp_delta_path, split_size) 95 | 96 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") 97 | base_files = glob.glob(base_pattern) 98 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin") 99 | delta_files = glob.glob(delta_pattern) 100 | delta_state_dict = torch.load(delta_files[0]) 101 | 102 | print("Applying the delta") 103 | weight_map = {} 104 | total_size = 0 105 | 106 | for i, base_file in tqdm(enumerate(base_files)): 107 | state_dict = torch.load(base_file) 108 | file_name = f"pytorch_model-{i}.bin" 109 | 
for name, param in state_dict.items(): 110 | if name not in delta_state_dict: 111 | for delta_file in delta_files: 112 | delta_state_dict = torch.load(delta_file) 113 | gc.collect() 114 | if name in delta_state_dict: 115 | break 116 | 117 | state_dict[name] += delta_state_dict[name] 118 | weight_map[name] = file_name 119 | total_size += param.numel() * param.element_size() 120 | gc.collect() 121 | torch.save(state_dict, os.path.join(target_model_path, file_name)) 122 | 123 | with open( 124 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" 125 | ) as f: 126 | json.dump( 127 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f 128 | ) 129 | 130 | print(f"Saving the target model to {target_model_path}") 131 | base_tokenizer.save_pretrained(target_model_path) 132 | base_config.save_pretrained(target_model_path) 133 | 134 | 135 | def apply_delta(base_model_path, target_model_path, delta_path): 136 | print(f"Loading the base model from {base_model_path}") 137 | # base = AutoModelForCausalLM.from_pretrained( 138 | # base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 139 | # ) 140 | # base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False) 141 | base = LlamaForCausalLM.from_pretrained( 142 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, proxies=PROXIES 143 | ) 144 | base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False, proxies=PROXIES) 145 | 146 | print(f"Loading the delta from {delta_path}") 147 | delta = AutoModelForCausalLM.from_pretrained( 148 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, proxies=PROXIES 149 | ) 150 | 151 | print("Applying the delta") 152 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): 153 | assert name in delta.state_dict() 154 | param.data += delta.state_dict()[name] 155 | 156 | print(f"Saving the target model to {target_model_path}") 157 | base.save_pretrained(target_model_path) 158 | base_tokenizer.save_pretrained(target_model_path) 159 | 160 | 161 | if __name__ == "__main__": 162 | parser = argparse.ArgumentParser() 163 | parser.add_argument("--base-model-path", type=str, required=True) 164 | parser.add_argument("--target-model-path", type=str, required=True) 165 | parser.add_argument("--delta-path", type=str, required=True) 166 | parser.add_argument( 167 | "--low-cpu-mem", 168 | action="store_true", 169 | help="Lower the cpu memory usage. This will split large files and use " 170 | "disk as swap to reduce the memory usage below 10GB.", 171 | ) 172 | args = parser.parse_args() 173 | 174 | if args.low_cpu_mem: 175 | apply_delta_low_cpu_mem( 176 | args.base_model_path, args.target_model_path, args.delta_path 177 | ) 178 | else: 179 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 180 | -------------------------------------------------------------------------------- /train/src/trainer.py: -------------------------------------------------------------------------------- 1 | from peft import PeftModel 2 | from transformers.trainer import * 3 | 4 | from src.utils import get_ds_state_dict 5 | 6 | 7 | class MyTrainer(Trainer): 8 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): 9 | """ 10 | Add supports for peft + deepspeed zero 3 11 | 12 | Will save the model, so you can reload it using `from_pretrained()`. 13 | 14 | Will only save from the main process. 
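Under DeepSpeed ZeRO stage 3, the (possibly LoRA) state dict is first gathered on every rank via get_ds_state_dict, and only rank 0 writes it to disk.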
15 | """ 16 | 17 | if output_dir is None: 18 | output_dir = self.args.output_dir 19 | 20 | if is_torch_tpu_available(): 21 | self._save_tpu(output_dir) 22 | elif is_sagemaker_mp_enabled(): 23 | # Calling the state_dict needs to be done on the wrapped model and on all processes. 24 | os.makedirs(output_dir, exist_ok=True) 25 | state_dict = self.model_wrapped.state_dict() 26 | if self.args.should_save: 27 | self._save(output_dir, state_dict=state_dict) 28 | if IS_SAGEMAKER_MP_POST_1_10: 29 | # 'user_content.pt' indicates model state_dict saved with smp >= 1.10 30 | Path(os.path.join(output_dir, "user_content.pt")).touch() 31 | elif ( 32 | ShardedDDPOption.ZERO_DP_2 in self.args.sharded_ddp 33 | or ShardedDDPOption.ZERO_DP_3 in self.args.sharded_ddp 34 | or self.fsdp is not None 35 | or self.is_fsdp_enabled 36 | ): 37 | if self.is_fsdp_enabled: 38 | os.makedirs(output_dir, exist_ok=True) 39 | self.accelerator.state.fsdp_plugin.save_model(self.accelerator, self.model, output_dir) 40 | else: 41 | state_dict = self.model.state_dict() 42 | 43 | if self.args.should_save: 44 | self._save(output_dir, state_dict=state_dict) 45 | elif self.is_deepspeed_enabled: 46 | # This must be called on all ranks in stage 3 47 | if is_deepspeed_zero3_enabled(): 48 | state_dict = get_ds_state_dict(self.deepspeed) 49 | else: 50 | # Only run on rank 0 except stage 3 51 | if self.args.should_save: 52 | state_dict = get_ds_state_dict(self.deepspeed) 53 | # this takes care of everything as long as we aren't under zero3 54 | # Only run on rank 0 55 | if self.args.should_save: 56 | # state_dict is available on rank 0 57 | self._save(output_dir, state_dict=state_dict) 58 | 59 | elif self.args.should_save: 60 | self._save(output_dir) 61 | 62 | # Push to the Hub when `save_model` is called by the user. 63 | if self.args.push_to_hub and not _internal_call: 64 | self.push_to_hub(commit_message="Model save") 65 | 66 | def _load_from_checkpoint(self, resume_from_checkpoint, model=None): 67 | """ 68 | Add supports for peft resume 69 | """ 70 | if model is None: 71 | model = self.model 72 | 73 | config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME) 74 | 75 | weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME) 76 | weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME) 77 | adapter_model_path = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME) 78 | safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME) 79 | safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) 80 | safe_adapter_model_path = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME) 81 | 82 | if not any( 83 | os.path.isfile(f) for f in [weights_file, safe_weights_file, adapter_model_path, weights_index_file, safe_weights_index_file, safe_adapter_model_path] 84 | ): 85 | raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") 86 | 87 | logger.info(f"Loading model from {resume_from_checkpoint}.") 88 | 89 | if os.path.isfile(config_file): 90 | config = PretrainedConfig.from_json_file(config_file) 91 | checkpoint_version = config.transformers_version 92 | if checkpoint_version is not None and checkpoint_version != __version__: 93 | logger.warning( 94 | f"You are resuming training from a checkpoint trained with {checkpoint_version} of " 95 | f"Transformers but your current version is {__version__}. This is not recommended and could " 96 | "yield to errors or unwanted behaviors." 
97 | ) 98 | 99 | if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or \ 100 | os.path.isfile(adapter_model_path) or os.path.isfile(safe_adapter_model_path): 101 | # If the model is on the GPU, it still works! 102 | if is_sagemaker_mp_enabled(): 103 | if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")): 104 | # If the 'user_content.pt' file exists, load with the new smp api. 105 | # Checkpoint must have been saved with the new smp api. 106 | smp.resume_from_checkpoint( 107 | path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False 108 | ) 109 | else: 110 | # If the 'user_content.pt' file does NOT exist, load with the old smp api. 111 | # Checkpoint must have been saved with the old smp api. 112 | if hasattr(self.args, "fp16") and self.args.fp16 is True: 113 | logger.warning( 114 | "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported." 115 | ) 116 | state_dict = torch.load(weights_file, map_location="cpu") 117 | # Required for smp to not auto-translate state_dict from hf to smp (is already smp). 118 | state_dict["_smp_is_partial"] = False 119 | load_result = model.load_state_dict(state_dict, strict=True) 120 | # release memory 121 | del state_dict 122 | elif self.is_fsdp_enabled: 123 | self.accelerator.state.fsdp_plugin.load_model(self.accelerator, model, resume_from_checkpoint) 124 | else: 125 | if is_peft_available() and isinstance(model, PeftModel): 126 | model.load_adapter(resume_from_checkpoint, getattr(model, "active_adapter", "default"), is_trainable=True) 127 | else: 128 | # We load the model state dict on the CPU to avoid an OOM error. 129 | if self.args.save_safetensors and os.path.isfile(safe_weights_file): 130 | state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu") 131 | else: 132 | state_dict = torch.load(weights_file, map_location="cpu") 133 | 134 | # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 135 | # which takes *args instead of **kwargs 136 | load_result = model.load_state_dict(state_dict, False) 137 | # release memory 138 | del state_dict 139 | self._issue_warnings_after_load(load_result) 140 | else: 141 | # We load the sharded checkpoint 142 | load_result = load_sharded_checkpoint( 143 | model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors 144 | ) 145 | if not is_sagemaker_mp_enabled(): 146 | self._issue_warnings_after_load(load_result) 147 | -------------------------------------------------------------------------------- /train/scripts/convert_llama_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
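# Vendored copy of the Hugging Face convert_llama_weights_to_hf.py utility; usage notes are in the module docstring below.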
14 | import argparse 15 | import gc 16 | import json 17 | import math 18 | import os 19 | import shutil 20 | 21 | import torch 22 | 23 | from transformers import LlamaConfig, LlamaForCausalLM 24 | 25 | 26 | """ 27 | Sample usage: 28 | 29 | ``` 30 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \ 31 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path 32 | ``` 33 | 34 | Thereafter, models can be loaded via: 35 | 36 | ```py 37 | from transformers import LlamaForCausalLM, LlamaForTokenizer 38 | 39 | model = LlamaForCausalLM.from_pretrained("/output/path") 40 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 41 | ``` 42 | 43 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions 44 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 45 | """ 46 | 47 | INTERMEDIATE_SIZE_MAP = { 48 | "7B": 11008, 49 | "13B": 13824, 50 | "33B": 17920, 51 | "65B": 22016, 52 | } 53 | NUM_SHARDS = { 54 | "7B": 1, 55 | "13B": 2, 56 | "33B": 4, 57 | "65B": 8, 58 | } 59 | 60 | 61 | def compute_intermediate_size(n): 62 | return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 63 | 64 | 65 | def read_json(path): 66 | with open(path, "r") as f: 67 | return json.load(f) 68 | 69 | 70 | def write_json(text, path): 71 | with open(path, "w") as f: 72 | json.dump(text, f) 73 | 74 | 75 | def write_model(model_path, input_base_path, model_size): 76 | os.makedirs(model_path, exist_ok=True) 77 | tmp_model_path = os.path.join(model_path, "tmp") 78 | os.makedirs(tmp_model_path, exist_ok=True) 79 | 80 | params = read_json(os.path.join(input_base_path, "params.json")) 81 | num_shards = NUM_SHARDS[model_size] 82 | n_layers = params["n_layers"] 83 | n_heads = params["n_heads"] 84 | n_heads_per_shard = n_heads // num_shards 85 | dim = params["dim"] 86 | dims_per_head = dim // n_heads 87 | base = 10000.0 88 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 89 | 90 | # permute for sliced rotary 91 | def permute(w): 92 | return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim) 93 | 94 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.") 95 | # Load weights 96 | if model_size == "7B": 97 | # Not shared 98 | # (The sharded implementation would also work, but this is simpler.) 
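# torch.load below reads the whole consolidated checkpoint into CPU memory at once (see the RAM note in the module docstring).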
99 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") 100 | else: 101 | # Sharded 102 | loaded = [ 103 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") 104 | for i in range(num_shards) 105 | ] 106 | param_count = 0 107 | index_dict = {"weight_map": {}} 108 | for layer_i in range(n_layers): 109 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" 110 | if model_size == "7B": 111 | # Unsharded 112 | state_dict = { 113 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 114 | loaded[f"layers.{layer_i}.attention.wq.weight"] 115 | ), 116 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 117 | loaded[f"layers.{layer_i}.attention.wk.weight"] 118 | ), 119 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], 120 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], 121 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], 122 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], 123 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], 124 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], 125 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], 126 | } 127 | else: 128 | # Sharded 129 | # Note that in the 13B checkpoint, not cloning the two following weights will result in the checkpoint 130 | # becoming 37GB instead of 26GB for some reason. 131 | state_dict = { 132 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 133 | f"layers.{layer_i}.attention_norm.weight" 134 | ].clone(), 135 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 136 | f"layers.{layer_i}.ffn_norm.weight" 137 | ].clone(), 138 | } 139 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 140 | torch.cat( 141 | [ 142 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) 143 | for i in range(num_shards) 144 | ], 145 | dim=0, 146 | ).reshape(dim, dim) 147 | ) 148 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 149 | torch.cat( 150 | [ 151 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim) 152 | for i in range(num_shards) 153 | ], 154 | dim=0, 155 | ).reshape(dim, dim) 156 | ) 157 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 158 | [ 159 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim) 160 | for i in range(num_shards) 161 | ], 162 | dim=0, 163 | ).reshape(dim, dim) 164 | 165 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 166 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 167 | ) 168 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 169 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 170 | ) 171 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 172 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 173 | ) 174 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 175 | 
[loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 176 | ) 177 | 178 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 179 | for k, v in state_dict.items(): 180 | index_dict["weight_map"][k] = filename 181 | param_count += v.numel() 182 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 183 | 184 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" 185 | if model_size == "7B": 186 | # Unsharded 187 | state_dict = { 188 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 189 | "model.norm.weight": loaded["norm.weight"], 190 | "lm_head.weight": loaded["output.weight"], 191 | } 192 | else: 193 | state_dict = { 194 | "model.norm.weight": loaded[0]["norm.weight"], 195 | "model.embed_tokens.weight": torch.cat( 196 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 197 | ), 198 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), 199 | } 200 | 201 | for k, v in state_dict.items(): 202 | index_dict["weight_map"][k] = filename 203 | param_count += v.numel() 204 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 205 | 206 | # Write configs 207 | index_dict["metadata"] = {"total_size": param_count * 2} 208 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) 209 | 210 | config = LlamaConfig( 211 | hidden_size=dim, 212 | intermediate_size=compute_intermediate_size(dim), 213 | num_attention_heads=params["n_heads"], 214 | num_hidden_layers=params["n_layers"], 215 | rms_norm_eps=params["norm_eps"], 216 | ) 217 | config.save_pretrained(tmp_model_path) 218 | 219 | # Make space so we can load the model properly now. 220 | del state_dict 221 | del loaded 222 | gc.collect() 223 | 224 | print("Loading the checkpoint in a Llama model.") 225 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 226 | # Avoid saving this as part of the config. 
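# (_name_or_path would otherwise record the temporary tmp_model_path shard directory, which is deleted right after saving)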
227 | del model.config._name_or_path 228 | 229 | print("Saving in the Transformers format.") 230 | model.save_pretrained(model_path) 231 | shutil.rmtree(tmp_model_path) 232 | 233 | 234 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 235 | print(f"Fetching the tokenizer from {input_tokenizer_path}.") 236 | os.makedirs(tokenizer_path, exist_ok=True) 237 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json")) 238 | write_json( 239 | { 240 | "bos_token": "", 241 | "eos_token": "", 242 | "model_max_length": int(1e30), 243 | "tokenizer_class": "LlamaTokenizer", 244 | "unk_token": "", 245 | }, 246 | os.path.join(tokenizer_path, "tokenizer_config.json"), 247 | ) 248 | shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model")) 249 | 250 | 251 | def main(): 252 | parser = argparse.ArgumentParser() 253 | parser.add_argument( 254 | "--input_dir", 255 | help="Location of LLaMA weights, which contains tokenizer.model and model folders", 256 | ) 257 | parser.add_argument( 258 | "--model_size", 259 | choices=["7B", "13B", "33B", "65B", "tokenizer_only"], 260 | ) 261 | parser.add_argument( 262 | "--output_dir", 263 | help="Location to write HF model and tokenizer", 264 | ) 265 | args = parser.parse_args() 266 | if args.model_size != "tokenizer_only": 267 | write_model( 268 | model_path=args.output_dir, 269 | input_base_path=os.path.join(args.input_dir, args.model_size), 270 | model_size=args.model_size, 271 | ) 272 | write_tokenizer( 273 | tokenizer_path=args.output_dir, 274 | input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"), 275 | ) 276 | 277 | 278 | if __name__ == "__main__": 279 | main() -------------------------------------------------------------------------------- /train/dschat_train_v1/training_scripts/convert_llama_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import gc 16 | import json 17 | import math 18 | import os 19 | import shutil 20 | 21 | import torch 22 | 23 | from transformers import LlamaConfig, LlamaForCausalLM 24 | 25 | 26 | """ 27 | Sample usage: 28 | 29 | ``` 30 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \ 31 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path 32 | ``` 33 | 34 | Thereafter, models can be loaded via: 35 | 36 | ```py 37 | from transformers import LlamaForCausalLM, LlamaForTokenizer 38 | 39 | model = LlamaForCausalLM.from_pretrained("/output/path") 40 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 41 | ``` 42 | 43 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions 44 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
45 | """ 46 | 47 | INTERMEDIATE_SIZE_MAP = { 48 | "7B": 11008, 49 | "13B": 13824, 50 | "33B": 17920, 51 | "65B": 22016, 52 | } 53 | NUM_SHARDS = { 54 | "7B": 1, 55 | "13B": 2, 56 | "33B": 4, 57 | "65B": 8, 58 | } 59 | 60 | 61 | def compute_intermediate_size(n): 62 | return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 63 | 64 | 65 | def read_json(path): 66 | with open(path, "r") as f: 67 | return json.load(f) 68 | 69 | 70 | def write_json(text, path): 71 | with open(path, "w") as f: 72 | json.dump(text, f) 73 | 74 | 75 | def write_model(model_path, input_base_path, model_size): 76 | os.makedirs(model_path, exist_ok=True) 77 | tmp_model_path = os.path.join(model_path, "tmp") 78 | os.makedirs(tmp_model_path, exist_ok=True) 79 | 80 | params = read_json(os.path.join(input_base_path, "params.json")) 81 | num_shards = NUM_SHARDS[model_size] 82 | n_layers = params["n_layers"] 83 | n_heads = params["n_heads"] 84 | n_heads_per_shard = n_heads // num_shards 85 | dim = params["dim"] 86 | dims_per_head = dim // n_heads 87 | base = 10000.0 88 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 89 | 90 | # permute for sliced rotary 91 | def permute(w): 92 | return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim) 93 | 94 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.") 95 | # Load weights 96 | if model_size == "7B": 97 | # Not shared 98 | # (The sharded implementation would also work, but this is simpler.) 99 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") 100 | else: 101 | # Sharded 102 | loaded = [ 103 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") 104 | for i in range(num_shards) 105 | ] 106 | param_count = 0 107 | index_dict = {"weight_map": {}} 108 | for layer_i in range(n_layers): 109 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" 110 | if model_size == "7B": 111 | # Unsharded 112 | state_dict = { 113 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 114 | loaded[f"layers.{layer_i}.attention.wq.weight"] 115 | ), 116 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 117 | loaded[f"layers.{layer_i}.attention.wk.weight"] 118 | ), 119 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], 120 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], 121 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], 122 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], 123 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], 124 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], 125 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], 126 | } 127 | else: 128 | # Sharded 129 | # Note that in the 13B checkpoint, not cloning the two following weights will result in the checkpoint 130 | # becoming 37GB instead of 26GB for some reason. 
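# (most likely because torch.save keeps the full shared storage of un-cloned tensors, so each shard file carries extra data)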
131 | state_dict = { 132 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 133 | f"layers.{layer_i}.attention_norm.weight" 134 | ].clone(), 135 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 136 | f"layers.{layer_i}.ffn_norm.weight" 137 | ].clone(), 138 | } 139 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 140 | torch.cat( 141 | [ 142 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) 143 | for i in range(num_shards) 144 | ], 145 | dim=0, 146 | ).reshape(dim, dim) 147 | ) 148 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 149 | torch.cat( 150 | [ 151 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim) 152 | for i in range(num_shards) 153 | ], 154 | dim=0, 155 | ).reshape(dim, dim) 156 | ) 157 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 158 | [ 159 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim) 160 | for i in range(num_shards) 161 | ], 162 | dim=0, 163 | ).reshape(dim, dim) 164 | 165 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 166 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 167 | ) 168 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 169 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 170 | ) 171 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 172 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 173 | ) 174 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 175 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 176 | ) 177 | 178 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 179 | for k, v in state_dict.items(): 180 | index_dict["weight_map"][k] = filename 181 | param_count += v.numel() 182 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 183 | 184 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" 185 | if model_size == "7B": 186 | # Unsharded 187 | state_dict = { 188 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 189 | "model.norm.weight": loaded["norm.weight"], 190 | "lm_head.weight": loaded["output.weight"], 191 | } 192 | else: 193 | state_dict = { 194 | "model.norm.weight": loaded[0]["norm.weight"], 195 | "model.embed_tokens.weight": torch.cat( 196 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 197 | ), 198 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), 199 | } 200 | 201 | for k, v in state_dict.items(): 202 | index_dict["weight_map"][k] = filename 203 | param_count += v.numel() 204 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 205 | 206 | # Write configs 207 | index_dict["metadata"] = {"total_size": param_count * 2} 208 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) 209 | 210 | config = LlamaConfig( 211 | hidden_size=dim, 212 | intermediate_size=compute_intermediate_size(dim), 213 | num_attention_heads=params["n_heads"], 214 | num_hidden_layers=params["n_layers"], 215 | rms_norm_eps=params["norm_eps"], 216 | ) 217 | config.save_pretrained(tmp_model_path) 218 | 219 | # Make space so we can load the model properly now. 
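# (freeing the raw shard tensors here keeps from_pretrained below from doubling peak RAM)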
220 | del state_dict 221 | del loaded 222 | gc.collect() 223 | 224 | print("Loading the checkpoint in a Llama model.") 225 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 226 | # Avoid saving this as part of the config. 227 | del model.config._name_or_path 228 | 229 | print("Saving in the Transformers format.") 230 | model.save_pretrained(model_path) 231 | shutil.rmtree(tmp_model_path) 232 | 233 | 234 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 235 | print(f"Fetching the tokenizer from {input_tokenizer_path}.") 236 | os.makedirs(tokenizer_path, exist_ok=True) 237 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json")) 238 | write_json( 239 | { 240 | "bos_token": "", 241 | "eos_token": "", 242 | "model_max_length": int(1e30), 243 | "tokenizer_class": "LlamaTokenizer", 244 | "unk_token": "", 245 | }, 246 | os.path.join(tokenizer_path, "tokenizer_config.json"), 247 | ) 248 | shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model")) 249 | 250 | 251 | def main(): 252 | parser = argparse.ArgumentParser() 253 | parser.add_argument( 254 | "--input_dir", 255 | help="Location of LLaMA weights, which contains tokenizer.model and model folders", 256 | ) 257 | parser.add_argument( 258 | "--model_size", 259 | choices=["7B", "13B", "33B", "65B", "tokenizer_only"], 260 | ) 261 | parser.add_argument( 262 | "--output_dir", 263 | help="Location to write HF model and tokenizer", 264 | ) 265 | args = parser.parse_args() 266 | if args.model_size != "tokenizer_only": 267 | write_model( 268 | model_path=args.output_dir, 269 | input_base_path=os.path.join(args.input_dir, args.model_size), 270 | model_size=args.model_size, 271 | ) 272 | write_tokenizer( 273 | tokenizer_path=args.output_dir, 274 | input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"), 275 | ) 276 | 277 | 278 | if __name__ == "__main__": 279 | main() -------------------------------------------------------------------------------- /train/dschat_train_v1/README.md: -------------------------------------------------------------------------------- 1 | # 项目介绍 2 | 3 | 本目录代码基于[Deepspeed-Chat](https://github.com/microsoft/DeepSpeedExamples)项目,可用于微调大语言模型,包括全量参数微调(fine-tuning)和基于LoRA的参数高效微调。 4 | 5 | ## 1. 准备环境 6 | 7 | 我们提供了一个完整可运行的Docker环境,Dockerfile写在docker文件夹下. 8 | 9 | 考虑到build存在一定的困难,我们提供了镜像下载,你可以使用下面命令从dockerhub拉取我们的镜像,然后在镜像中运行代码。 10 | 11 | ```shell 12 | docker pull belleagi/belle:v1.0 13 | git clone https://github.com/LianjiaTech/BELLE.git 14 | docker run -it --runtime=nvidia --shm-size="40g" -v /path/BELLE/train:/workspace/BELLE-train -v /path/huggingface_models/:/workspace/model_name_or_path belleagi/belle:v1.0 /bin/bash 15 | ``` 16 | 17 | 上述命令实现了以下几点: 18 | 19 | 1. 拉取docker镜像 20 | 2. clone BELLE仓库 21 | 3. 将BELLE/train目录与Docker环境中/workspace/BELLE-train目录挂载 22 | 4. 将huggingface_models目录与Docker环境中/workspace/model_name_or_path目录挂载。其中huggingface_models代表预训练模型的保存路径,该目录下存放所有需要的预训练语言模型,如llama-7b, bloomz-7b1-mt等 23 | 5. 注意:上述挂载的目录必须是绝对路径 24 | 25 | ## 2. 
Run 26 | 27 | ### 2.1 Download data 28 | 29 | ```bash 30 | wget https://huggingface.co/datasets/BelleGroup/school_math_0.25M/resolve/main/school_math_0.25M.json 31 | ``` 32 | 33 | 该数据是数学应用题求解任务,包含解题过程,共计约25万条。示例: 34 | 35 | | Instruction | Answer | 36 | | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 37 | | 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天? | 一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可: 25 ÷ 9 = 2余7 也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。 因此,一箱鸡蛋能够供小明吃2天。 | 38 | | 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果? | 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。 2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 计算过程如下: 15 - 6 = 9 所以,小明还剩下9颗糖果。 | 39 | 40 | #### 2.1.1 Prepare data 41 | 42 | ```bash 43 | python training_scripts/convert_to_conv_data.py --orig_data school_math_0.25M.json --write_data school_math_0.25M_conv.json --dataset_name bellemath 44 | head -n 1000 school_math_0.25M_conv.json > belleMath-dev1K.json 45 | tail -n +1001 school_math_0.25M_conv.json > belleMath.json 46 | ``` 47 | 48 | 我们选取前1000条作为验证集,其余数据作为训练集 49 | 50 | 我们会在Instruction的开头和结尾加上Human和Assistant作为模型的输入,形如: 51 | 52 | | Instruction | 53 | | -------------------------------------------------------------------------------------------------------------------- | 54 | | Human: 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?\n\nAssistant: | 55 | | Human: 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?\n\nAssistant: | 56 | 57 | **注:**目前仓库的代码要兼容单轮和多轮数据,因此对于之前的开源数据(格式为(instruction,output)或者(input,target)。目前绝大部分开源的单轮的instruction数据均为这种格式),需要用convert_to_conv_data.py脚本转换为如下的统一格式: 58 | 59 | ```bash 60 | {"id": "bellemath42", "conversations": [{"from": "human", "value": " 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?"}, {"from": "assistant", "value": "\n一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可:\n25 ÷ 9 = 2余7\n也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。\n因此,一箱鸡蛋能够供小明吃2天。"}]} 61 | {"id": "bellemath80", "conversations": [{"from": "human", "value": "小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?"}, {"from": "assistant", "value": " \n1. 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。\n2. 
然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 \n计算过程如下:\n15 - 6 = 9\n所以,小明还剩下9颗糖果。"}]} 62 | ``` 63 | 64 | 其他的训练数据见:https://huggingface.co/BelleGroup 按照上述流程转换格式即可。 65 | 66 | #### 2.1.2 Multi-turn data 67 | 68 | [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main) 是一个开源的大规模的多轮对话数据,具体效果可参考我们的工作:[Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://arxiv.org/pdf/2304.07854.pdf) 69 | 70 | 当前代码已支持训练这种多轮对话数据。数据下载: 71 | 72 | ```bash 73 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 74 | mv ShareGPT_V3_unfiltered_cleaned_split.json shareGPT.json 75 | ``` 76 | 77 | ### 2.2 Train 78 | 79 | 目前支持单机单卡和单机多卡的训练。不同于 [Deepspeed-Chat ](https://github.com/microsoft/DeepSpeedExamples),我们仅针对stage1,也就是SFT阶段(具体来说是instruction-tuning)。 80 | 81 | #### 2.2.1 单机多卡训练 82 | 83 | ##### Fine-Tuning 84 | 85 | 如果要实现单机多卡微调,仅需要运行如下命令 86 | 87 | ```bash 88 | bash training_scripts/single_node/run_FT.sh output 2 89 | ``` 90 | 91 | - output 代表数据和模型保存的路径,如果没有则会创建。 92 | - 2 代表zero_stage 93 | 94 | 具体启动命令和参数配置如下: 95 | 96 | ```bash 97 | deepspeed main.py \ 98 | --sft_only_data_path belleMath.json \ 99 | --eval_data_file belleMath-dev1K.json \ 100 | --model_name_or_path /workspace/model_name_or_path/hf_llama_7b \ 101 | --per_device_train_batch_size 1 \ 102 | --per_device_eval_batch_size 1 \ 103 | --max_seq_len 1024 \ 104 | --learning_rate 5e-7 \ 105 | --weight_decay 0.0001 \ 106 | --num_train_epochs 1 \ 107 | --gradient_accumulation_steps 4 \ 108 | --lr_scheduler_type cosine \ 109 | --num_warmup_steps 100 \ 110 | --seed 1234 \ 111 | --gradient_checkpointing \ 112 | --zero_stage $ZERO_STAGE \ 113 | --deepspeed \ 114 | --output_dir $OUTPUT \ 115 | --data_output_path $data_output_path \ 116 | ``` 117 | 118 | - sft_only_data_path 就是训练集数据。如果想换成shareGPT,仅需替换为shareGPT.json 119 | - eval_data_file 代表验证集数据,如果没有预先划分出训练和验证数据,可以不指定该参数,此时将会从训练数据中随机抽取1000条作为验证数据 120 | - model_name_or_path就是基础模型。我们建议基于我们开源的模型(如:[BelleGroup/BELLE-LLaMA-EXT-7B](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B)) 作为基础模型进行进一步微调,这样仅需要少量训练数据和训练轮次即可微调一个效果不错的模型。 121 | - zero_stage。可优先设置为1或者2,如果显存不足,设置为3。关于zero-stage的详细介绍可参考: https://www.deepspeed.ai/tutorials/zero/ 122 | 123 | ##### LoRA 124 | 125 | 如果要实现单机多卡LoRA-based tuning,需要运行如下命令: 126 | 127 | ```bash 128 | bash training_scripts/single_node/run_LoRA.sh output-lora 2 129 | ``` 130 | 131 | - output 代表数据和模型保存的路径,如果没有则会创建。 132 | - 2 代表zero_stage 133 | 134 | 具体启动命令和参数配置如下: 135 | 136 | ```bash 137 | model_name_or_path=/workspace/model_name_or_path/hf_llama_7b 138 | lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj" 139 | echo ${lora_module_name} 140 | 141 | deepspeed main.py \ 142 | --sft_only_data_path belleMath.json \ 143 | --eval_data_file belleMath-dev1K.json \ 144 | --data_split 10,0,0 \ 145 | --model_name_or_path ${model_name_or_path} \ 146 | --per_device_train_batch_size 16 \ 147 | --per_device_eval_batch_size 1 \ 148 | --max_seq_len 1024 \ 149 | --learning_rate 3e-4 \ 150 | --weight_decay 0. 
\ 151 | --num_train_epochs 1 \ 152 | --gradient_accumulation_steps 1 \ 153 | --lr_scheduler_type cosine \ 154 | --num_warmup_steps 100 \ 155 | --seed 1234 \ 156 | --gradient_checkpointing \ 157 | --zero_stage $ZERO_STAGE \ 158 | --lora_dim 16 \ 159 | --lora_alpha 16 \ 160 | --lora_droppout 0.05 \ 161 | --lora_module_name ${lora_module_name} \ 162 | --deepspeed \ 163 | --output_dir $OUTPUT_PATH \ 164 | ``` 165 | 166 | - lora_module_name代表LoRA需要adapt的参数,我们的实验设置是attention+MLP的参数。不同的预训练模型的权重名称不一样,比如对于Bloom模型,对应的attention权重的名称是query_key_value,此时lora_module_name可以改为"query_key_value,mlp" 167 | - lora_dim、lora_alpha、lora_droppout均为LoRA训练的超参数 168 | 169 | #### 2.2.2 单机单卡训练 170 | 171 | ##### Fine-Tuning 172 | 173 | 如果要实现单机单卡微调,仅需要运行如下命令 174 | 175 | ```bash 176 | bash training_scripts/single_gpu/run_FT.sh output 3 177 | ``` 178 | 179 | 其余配置与上述内容一致。 180 | 181 | ##### LoRA 182 | 183 | 如果要实现单机单卡LoRA-based tuning,需要运行如下命令: 184 | 185 | ```bash 186 | bash training_scripts/single_gpu/run_LoRA.sh output-lora 3 187 | ``` 188 | 189 | 其余配置与上述内容一致。 190 | 191 | 如果出现显存不足的情况,需要调整per_device_train_batch_size、max_seq_len、zero_stage三个参数。另外可参考[Deepspeed-Chat-training_scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts) 中各个启动脚本内的参数配置 192 | 193 | 其余参数说明详见:https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md 194 | 195 | **注:** 196 | 197 | - 如果是单轮instruction数据,比如 [BELLE-2M](https://huggingface.co/datasets/BelleGroup/train_2M_CN) 等。对于Bloom模型,建议max_seq_len设置为512-1024之间。而对于LLaMA模型,max_seq_len尽可能不要低于1024。避免切割出太多不完整的句子,不利于模型学习。 198 | - 如果是多轮对话数据,比如 [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main),max_seq_len建议设置为2048 199 | - 运行脚本中各个参数的设置需要按实际情况进行调整,下面给出一些参数值供参考: 200 | - 对于learning_rate,FT训练时设置为5e-6量级,LoRA训练时设置为3e-4量级 201 | - 对于epoch,FT训练时设置为2或3,LoRA训练时设置为4或5 202 | 203 | ### 2.3 Generate 204 | 205 | 现在我们加载训练好的模型,验证模型生成文本的效果。 206 | 207 | ```bash 208 | CUDA_VISIBLE_DEVICES=0 python prompt_eval.py \ 209 | --model_name_or_path model_name_or_path \ 210 | --finetuned_model_name_or_path finetuned_model_name_or_path \ 211 | --test_file test_file_name_or_path 212 | ``` 213 | 214 | 参数说明: 215 | 216 | - model_name_or_path 是原生预训练模型的路径 217 | - finetuned_model_name_or_path 是训练后保存的模型 218 | - test_file就是验证集数据,默认路径是belleMath-dev1K.json 219 | 220 | 举例: 221 | 222 | ```bash 223 | CUDA_VISIBLE_DEVICES=0 python prompt_eval.py \ 224 | --model_name_or_path /workspace/model_name_or_path/hf_llama_7b \ 225 | --finetuned_model_name_or_path output-lora \ 226 | --test_file belleMath-dev1K.json 227 | ``` 228 | 229 | 模型生成的数据保存在predictions.json文件中 230 | 231 | 这里给出一些预测的示例: 232 | 233 | ```bash 234 | {"instruction": "小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?", "generate_text": "Human: 小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?\nAssistant: 将12个橙子平均分成4份,可以用除法来解决。首先,我们需要计算出每一份应该包含多少个橙子:\n12÷4=3\n因此,每个朋友都会得到3个橙子。答案是每人能分到3个橙子。"} 235 | {"instruction": "以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?", "generate_text": "Human: 以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?\nAssistant: 首先,我们需要计算出两个人的宠物总数。小明家里共有 5 只宠物(3 + 2 = 5);小花家里共有 6 只宠物(4 + 1 = 5)。因此,两个人的宠物数量相同。所以,无法确定谁家里宠物更多。"} 236 | {"instruction": "题目:小明有5个球,他送给小红2个球,还剩多少个球?", "generate_text": "Human: 题目:小明有5个球,他送给小红2个球,还剩多少个球?\nAssistant: 小明原来有5个球,送出去了2个球后,就是5-2=3。所以小明现在手里只剩下3个球了。答案为3。"} 237 | {"instruction": "请问2+3等于几?", "generate_text": "Human: 
请问2+3等于几?\nAssistant: 将两个数字相加,得到答案为5。因此,2+3=5。"} 238 | ``` 239 | 240 | generate方法的参数配置如下: 241 | 242 | ```bash 243 | max_new_tokens = 1024 244 | generation_config = dict( 245 | temperature=0.001, 246 | top_k=30, 247 | top_p=0.85, 248 | do_sample=True, 249 | num_beams=1, 250 | repetition_penalty=1.2, 251 | max_new_tokens=max_new_tokens 252 | ) 253 | ``` 254 | 255 | ## 3. Additional Notes 256 | 257 | ### 3.1 LLaMA模型的使用 258 | 259 | #### 3.1.1 facebook官方LLaMA权重转为hf格式 260 | 261 | 首先,您需要从[facebookresearch/llama](https://github.com/facebookresearch/llama)获取LLaMA模型的访问权限,下载官方检查点 262 | 263 | ```bash 264 | python training_scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf 265 | ``` 266 | 267 | 运行训练脚本时将model_name_or_path改为xx/llama-7b-hf即可 268 | 269 | #### 3.1.2 BELLE-LLaMA转为hf格式 270 | 271 | 由于LLaMA模型的使用约束,我们只能开源与原始模型的diff(如:[BELLE-LLaMA-7B-2M-enc](https://huggingface.co/BelleGroup/BELLE-LLaMA-7B-2M-enc))。当您已经从[facebookresearch/llama](https://github.com/facebookresearch/llama)获取LLaMA模型的访问权限后,可参考 https://github.com/LianjiaTech/BELLE/tree/main/models ,转换后的模型即为我们指令调优后的LLaMA模型。 272 | 273 | ## 4. 致谢 274 | 275 | 1. [Deepspeed-Chat](https://github.com/microsoft/DeepSpeedExamples) 276 | 277 | ## 5. 问题反馈 278 | 279 | 如有问题,请在GitHub Issue中提交。在提交问题前,请先查看 https://github.com/microsoft/DeepSpeedExamples/issues 中是否已出现过解决类似问题的方法。 280 | 281 | **我们的实验均在8卡A100 40G上运行,在之前的实验过程中发现在V100上运行可能会遇到问题。因此如果是在V100上运行报错,请自行查阅相关解决方案,可主要参考 [deepspeed-chat issues](https://github.com/microsoft/DeepSpeedExamples/issues)**。 282 | 283 | ## 6. FAQ 284 | 285 | 我们会持续更新FAQ,并对询问的问题进行分类。Others中给出的是我们在实验过程中遇到的一些报错的情况以及参考的解决方案 286 | 287 | - [1. 单机单卡可以训练多大参数量的模型](FAQ.md#1) 288 | - [2. 单机多卡可以训练多大参数量的模型](FAQ.md#2) 289 | - [3. 单机单卡采用LoRA可以训练多大参数量的模型](FAQ.md#3) 290 | - [4. 单机多卡采用LoRA可以训练多大参数量的模型](FAQ.md#4) 291 | - [5. 加载Llama tokenizer时存在的问题](FAQ.md#5) 292 | - [6. 加载2M的数据量需要多大的内存和多长时间](FAQ.md#6) 293 | - [7. 训练模型的生成结果非常糟糕](FAQ.md#7) 294 | - [Others](FAQ.md#Others) 295 | 296 | ## 7. 部分代码实现细节 297 | 298 | 本仓库实验代码仅对Deepspeed-Chat项目中training/step1_supervised_finetuning内的部分代码做了简单的修改。具体修改内容如下: 299 | 300 | 1. 需要在utils/data/raw_datasets.py中实现一个类,比如BelleOpenSoucreDataset,用于读取训练数据 301 | 2. 由于训练的目标是为了让模型学会回复人类指令,所以我们仅对answer文本计算loss。需要在utils/data/data_utils.py的create_dataset_split方法中修改tokenize部分,在human instruction文本部分对应的label加上-100作为mask。如果是多轮对话数据,每一轮的human instruction对应的label都会加上-100 302 | -------------------------------------------------------------------------------- /train/docs/README_en.md: -------------------------------------------------------------------------------- 1 | # BELLE Training Code 2 | 3 | | [English](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/README_en.md) | [中文](https://github.com/LianjiaTech/BELLE/blob/main/train/README.md) 4 | 5 | ## 1. Set up the environment 6 | 7 | ### 1.1 Docker image 8 | 9 | We provide a fully runnable Docker image, with the Dockerfile located in the docker folder. 10 | 11 | Considering the difficulty of building the image, we offer an image download. You can use the following command to pull our image from DockerHub and run the code in the image environment. 12 | 13 | ```bash 14 | docker pull belleagi/belle:v1.0 15 | git clone https://github.com/LianjiaTech/BELLE.git 16 | docker run -it --runtime=nvidia --shm-size="40g" -v /path/BELLE/train:/workspace/BELLE-train -v /path/huggingface_models/:/workspace/model_name_or_path belleagi/belle:v1.0 /bin/bash 17 | ``` 18 | 19 | 20 | 21 | The above command accomplishes the following: 22 | 23 | 1. 
Pull the docker image 24 | 2. Clone the BELLE repository 25 | 3. Mount the BELLE/train directory with the /workspace/BELLE-train directory in the Docker environment 26 | 4. Mount the huggingface_models directory with the /workspace/model_name_or_path directory in the Docker environment. The huggingface_models directory represents the save path for pre-trained language models, which stores all required pre-trained language models, such as llama-7b, bloomz-7b1-mt, etc. 27 | 5. Note: The mounted directories must be absolute paths 28 | 29 | ### 1.2 Conda 30 | 31 | If you cannot use Docker due to machine constraints or other reasons, you can also create an environment with Conda and install the required packages using pip. 32 | 33 | ```bash 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | 38 | 39 | However, installing DeepSpeed with pip may fail during installation or runtime. The [FAQ](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/FAQ.md) provides some tutorials on installing DeepSpeed and possible issues you may encounter. 40 | 41 | ## 2. Run 42 | 43 | ### 2.1 Download data 44 | 45 | ```bash 46 | wget https://huggingface.co/datasets/BelleGroup/school_math_0.25M/resolve/main/school_math_0.25M.json 47 | ``` 48 | 49 | 50 | 51 | This dataset is for math problem-solving tasks, including the problem-solving process, with a total of about 250,000 entries. Example: 52 | 53 | | Instruction | Answer | 54 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 55 | | 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天? | 一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可: 25 ÷ 9 = 2余7 也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。 因此,一箱鸡蛋能够供小明吃2天。 | 56 | | 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果? | 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。 2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 计算过程如下: 15 - 6 = 9 所以,小明还剩下9颗糖果。 | 57 | 58 | #### 2.1.1 Convert data format 59 | 60 | ```bash 61 | python scripts/convert_to_conv_data.py --orig_data school_math_0.25M.json --write_data school_math_0.25M_conv.json --dataset_name bellemath 62 | head -n 1000 school_math_0.25M_conv.json > belleMath-dev1K.json 63 | tail -n +1001 school_math_0.25M_conv.json > belleMath.json 64 | ``` 65 | 66 | 67 | 68 | We select the first 1000 entries as the validation set, and the rest as the training set. 69 | 70 | **Note:** Currently, the repository code needs to be compatible with both single-turn and multi-turn data. Therefore, for previously open-sourced data (formatted as [instruction, output], most of the open-sourced single-turn instruction data is in this format), you need to use the convert_to_conv_data.py script to convert it to the following unified format: 71 | 72 | ```bash 73 | {"id": "bellemath42", "conversations": [{"from": "human", "value": " 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?"}, {"from": "assistant", "value": "\n一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可:\n25 ÷ 9 = 2余7\n也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。\n因此,一箱鸡蛋能够供小明吃2天。"}]} 74 | {"id": "bellemath80", "conversations": [{"from": "human", "value": "小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?"}, {"from": "assistant", "value": " \n1. 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。\n2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 \n计算过程如下:\n15 - 6 = 9\n所以,小明还剩下9颗糖果。"}]} 75 | ``` 76 | 77 | 78 | 79 | 80 | Other training data can be found at: https://huggingface.co/BelleGroup. Convert the format according to the above process. 
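
If your own data is in the single-turn [instruction, output] style, the conversion performed by convert_to_conv_data.py essentially wraps each record in the conversation schema shown above. The following is a minimal stand-alone sketch of that transformation, not the repository script itself: it assumes the input is a JSON-lines file whose records contain `instruction` and `output` fields, and the file name `convert_min.py` is only illustrative.

```python
import argparse
import json


def convert(orig_data: str, write_data: str, dataset_name: str) -> None:
    """Wrap single-turn [instruction, output] records in the unified conversation format."""
    with open(orig_data, "r", encoding="utf-8") as fin, \
         open(write_data, "w", encoding="utf-8") as fout:
        for idx, line in enumerate(fin):
            record = json.loads(line)  # assumed fields: "instruction", "output"
            sample = {
                "id": f"{dataset_name}{idx}",
                "conversations": [
                    {"from": "human", "value": record["instruction"]},
                    {"from": "assistant", "value": record["output"]},
                ],
            }
            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--orig_data", required=True)
    parser.add_argument("--write_data", required=True)
    parser.add_argument("--dataset_name", default="bellemath")
    args = parser.parse_args()
    convert(args.orig_data, args.write_data, args.dataset_name)
```

It would be invoked the same way as the command shown earlier, e.g. `python convert_min.py --orig_data school_math_0.25M.json --write_data school_math_0.25M_conv.json --dataset_name bellemath`, after which the same head/tail split can be used to carve out a validation set.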
81 | 82 | For multi-turn conversation data, [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main) is an open-source large-scale multi-turn conversation dataset. You can refer to our work: [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://arxiv.org/pdf/2304.07854.pdf) for specific performance. 83 | 84 | The current code supports training this multi-turn conversation data. Download the data: 85 | 86 | ```bash 87 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 88 | ``` 89 | 90 | 91 | 92 | ### 2.2 Model Training 93 | 94 | The training start script is located in scripts/run.sh, and you need to modify the parameters in run.sh according to your actual requirements. 95 | 96 | ```bash 97 | bash scripts/run.sh 98 | ``` 99 | 100 | 101 | 102 | - model_name_or_path represents the pre-trained model (if it is a LLaMA model, it needs to be converted to hf format beforehand to be loaded with from_pretrained) 103 | - train_file represents the training data 104 | - validation_file represents the validation data 105 | - output_dir represents the path for saving training logs and models 106 | - cache_dir represents the path for caching data processing 107 | - cutoff_len represents the maximum input sequence length (for LLaMA models, it is recommended to set it to 1024 or above; for Bloom models, set it to 512 or above) 108 | 109 | The run.sh script contains both full parameter fine-tuning and LoRA training start commands. Here, we will briefly explain the meaning of each parameter in the start command. 110 | 111 | #### 2.2.1 Full parameter fine-tuning 112 | 113 | The following command is for full-parameter fine-tuning on a single machine with multiple cards, using deepspeed and the base model LLaMA: 114 | 115 | ```bash 116 | torchrun --nproc_per_node 8 train.py \ 117 | --model_name_or_path ${model_name_or_path} \ 118 | --llama \ 119 | --deepspeed configs/deepspeed_config.json \ 120 | --train_file ${train_file} \ 121 | --validation_file ${validation_file} \ 122 | --per_device_train_batch_size 2 \ 123 | --per_device_eval_batch_size 2 \ 124 | --gradient_accumulation_steps 4 \ 125 | --num_train_epochs 2 \ 126 | --model_max_length ${cutoff_len} \ 127 | --save_strategy "steps" \ 128 | --save_total_limit 3 \ 129 | --learning_rate 8e-6 \ 130 | --weight_decay 0.00001 \ 131 | --warmup_ratio 0.05 \ 132 | --lr_scheduler_type "cosine" \ 133 | --logging_steps 10 \ 134 | --evaluation_strategy "steps" \ 135 | --fp16 True \ 136 | --seed 1234 \ 137 | --gradient_checkpointing True \ 138 | --cache_dir ${cache_dir} \ 139 | --output_dir ${output_dir} 140 | ``` 141 | 142 | 143 | 144 | **Parameter Description** 145 | 146 | 1. If you want to train with a single card, just set nproc_per_node to 1. 147 | 2. If the pre-trained model is not LLaMA, remove --llama. If it is a LLaMA model, you need to specify --llama. This is because LLaMA models need to use LLamaTokenizer for loading, and using AutoTokenizer for llama may cause infinite recursion problems, which is related to the transformers version. 148 | 3. If the runtime environment does not support deepspeed, remove --deepspeed. 149 | 150 | Deepspeed parameter configuration can refer to: 151 | 152 | 1. https://www.deepspeed.ai/docs/config-json/ 153 | 2. 
https://huggingface.co/docs/accelerate/usage_guides/deepspeed 154 | 155 | Training logs and models are saved in the output_dir directory, and the file structure in the directory should be as follows: 156 | 157 | ```Arduino 158 | output_dir/ 159 | ├── checkpoint-244/ 160 | │ ├── pytorch_model.bin 161 | │ ├── config.json 162 | │ └── trainer_state.json 163 | ├── checkpoint-527/ 164 | │ ├── pytorch_model.bin 165 | │ ├── config.json 166 | │ └── trainer_state.json 167 | ├── pytorch_model.bin 168 | ├── print_log.txt 169 | └── config.json 170 | ``` 171 | 172 | 173 | 174 | trainer_state.json records the changes in loss and learning_rate. 175 | 176 | #### 2.2.2 LoRA 177 | 178 | ```bash 179 | torchrun --nproc_per_node 8 train.py \ 180 | --model_name_or_path ${model_name_or_path} \ 181 | --llama \ 182 | --use_lora True \ 183 | --use_int8_training \ 184 | --lora_config configs/lora_config_llama.json \ 185 | --train_file ${train_file} \ 186 | --validation_file ${validation_file} \ 187 | --per_device_train_batch_size 2 \ 188 | --per_device_eval_batch_size 2 \ 189 | --gradient_accumulation_steps 4 \ 190 | --num_train_epochs 2 \ 191 | --model_max_length ${cutoff_len} \ 192 | --save_strategy "steps" \ 193 | --save_total_limit 3 \ 194 | --learning_rate 8e-6 \ 195 | --weight_decay 0.00001 \ 196 | --warmup_ratio 0.05 \ 197 | --lr_scheduler_type "cosine" \ 198 | --logging_steps 10 \ 199 | --evaluation_strategy "steps" \ 200 | --fp16 True \ 201 | --seed 1234 \ 202 | --gradient_checkpointing True \ 203 | --cache_dir ${cache_dir} \ 204 | --output_dir ${output_dir} 205 | ``` 206 | 207 | 208 | 209 | **Parameter Description** 210 | 211 | - use_lora represents using LoRA for training. 212 | - use_int8_training represents using 8-bit quantization training, which can significantly reduce GPU memory usage. 213 | - lora_config provides the parameter configuration for LoRA. If training the Bloom model, change it to configs/lora_config_bloom.json. 214 | 215 | The file structure of the output_dir directory is as follows: 216 | 217 | ```Arduino 218 | output_dir/ 219 | ├── checkpoint-244/ 220 | │ ├── pytorch_model.bin 221 | │ └── trainer_state.json 222 | ├── checkpoint-527/ 223 | │ ├── pytorch_model.bin 224 | │ └── trainer_state.json 225 | ├── adapter_model.bin 226 | ├── print_log.txt 227 | └── adapter_config.json 228 | ``` 229 | 230 | 231 | 232 | **Note: The saved model adapter_model.bin after LoRA training may be an empty file. In this case, you need to copy the pytorch_model.bin saved in other checkpoint-step to the output_dir directory. For example:** 233 | 234 | ```bash 235 | cd output_dir 236 | cp checkpoint-527/pytorch_model.bin adapter_model.bin 237 | ``` 238 | 239 | 240 | 241 | **Make sure adapter_model.bin is a valid LoRA weight.** 242 | 243 | #### 2.2.3 Merging LoRA Weights 244 | 245 | If you want to merge the LoRA weights with the pre-trained model, you can run the following command: 246 | 247 | ```bash 248 | bash scripts/merge_lora.sh 249 | ``` 250 | 251 | The merged weights are saved in the output_path directory and can be loaded directly using from_pretrained later. 252 | 253 | ## 3. Inference 254 | 255 | ### 3.1 Inference 256 | 257 | If you see this section, it means you have completed the training. Now we load the trained model to verify the text generation effect. 
258 | 
259 | ```bash
260 | CUDA_VISIBLE_DEVICES=0 python src/inference.py \
261 | --model_name_or_path model_name_or_path \
262 | --ckpt_path ckpt_path \
263 | --llama \
264 | --use_lora
265 | ```
266 | 
267 | 
268 | 
269 | **Parameter Description:**
270 | 
271 | - model_name_or_path is the path to the native pre-trained model.
272 | - ckpt_path is the path to the model saved after training, i.e., the output_dir.
273 | - llama indicates that the base model is a LLaMA model.
274 | - use_lora indicates that ckpt_path contains LoRA weights.
275 | 
276 | **Note: the adapter_model.bin saved after LoRA training may be an empty file. In this case, you need to copy the pytorch_model.bin saved in one of the checkpoint-step directories to the output_dir directory.**
277 | 
278 | In addition, if you have already merged the LoRA weights into the pre-trained model, you only need to set ckpt_path to the path where the merged weights are saved; there is no need to specify use_lora. A minimal Python sketch of this loading path is given at the end of this document.
279 | 
280 | ### 3.2 WebUI
281 | 
282 | We also provide a simple interactive web interface based on gradio. To start the service:
283 | 
284 | ```bash
285 | CUDA_VISIBLE_DEVICES=0 python src/interface.py \
286 | --model_name_or_path model_name_or_path \
287 | --ckpt_path ckpt_path \
288 | --llama \
289 | --use_lora
290 | ```
291 | 
292 | 
293 | 
294 | The service is then reachable at hostip:17860.
295 | 
296 | ![webUI](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/interface.png)
297 | 
298 | ## 4. Additional Notes
299 | 
300 | ### 4.1 Usage of LLaMA Model
301 | 
302 | #### 4.1.1 Converting official LLaMA weights from Facebook to Hugging Face format
303 | 
304 | First, you need to obtain access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama) and download the official checkpoints.
305 | 
306 | ```bash
307 | python training_scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf
308 | ```
309 | 
310 | 
311 | 
312 | When running the training script, change the `model_name_or_path` to `xx/llama-7b-hf`.
313 | 
314 | #### 4.1.2 Converting BELLE-LLaMA to Hugging Face format
315 | 
316 | Due to the usage constraints of the LLaMA model, we can only open-source the diff from the original model, such as [BELLE-LLaMA-7B-2M-enc](https://huggingface.co/BelleGroup/BELLE-LLaMA-7B-2M-enc). Once you have obtained access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama), you can refer to https://github.com/LianjiaTech/BELLE/tree/main/models to convert the model. The converted model is our instruction-tuned LLaMA model.
317 | 
318 | ## 5. Feedback and Questions
319 | 
320 | If you have any questions, please submit them in GitHub Issues. Before opening a new issue, please check whether a similar problem has already been addressed in the [FAQ](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/FAQ.md).
321 | 
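
As a supplement to Section 3.1, the snippet below sketches, at the Python level, what loading a LoRA checkpoint for generation roughly involves: load the base model, attach the adapter with PEFT, and call generate with the decoding parameters used elsewhere in this project. This is a minimal sketch under stated assumptions rather than a copy of src/inference.py: it assumes a LLaMA base model already converted to Hugging Face format, a checkpoint directory containing a valid PEFT adapter, and placeholder paths.

```python
import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

base_model_path = "xx/llama-7b-hf"  # placeholder: base model in Hugging Face format
lora_ckpt_path = "output_dir"       # placeholder: directory holding the trained LoRA adapter

# LLaMA checkpoints are loaded with the Llama* classes (this is what the --llama flag selects).
tokenizer = LlamaTokenizer.from_pretrained(base_model_path)
model = LlamaForCausalLM.from_pretrained(
    base_model_path, torch_dtype=torch.float16, device_map="auto"
)

# Attach the LoRA weights saved during training (the --use_lora case).
model = PeftModel.from_pretrained(model, lora_ckpt_path, torch_dtype=torch.float16)
model.eval()

# Prompt template and decoding parameters follow the examples earlier in this repository.
prompt = "Human: \n请问2+3等于几?\n\nAssistant: \n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generation_config = GenerationConfig(
    temperature=0.001,
    top_k=30,
    top_p=0.85,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=1024,
)
with torch.no_grad():
    output_ids = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```

If the LoRA weights have already been merged into the base model (Section 2.2.3), the PeftModel step can be dropped and the merged directory passed directly to from_pretrained.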
-------------------------------------------------------------------------------- /train/src/entry_point/evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('path/of/BELLE/train/src') # revise\n", 11 | "# sys.path.append('path/of/pixiu_private-main/train/src')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "[2023-06-26 14:59:13,975] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "from utils import MultiClient\n", 29 | "ip = '...'\n", 30 | "base_port = 17860\n", 31 | "worker_addrs = [\n", 32 | " f\"http://{ip}:{base_port + i}\" for i in range(8)\n", 33 | "]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "client = MultiClient(worker_addrs)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "GENERATION_CONFIG = [\n", 52 | " # int | float (numeric value between 0 and 1) in 'Temperature' Slider component\n", 53 | " 0.01,\n", 54 | " # int | float (numeric value between 0 and 1) in 'Top p' Slider component\n", 55 | " 0.85,\n", 56 | " # int | float (numeric value between 0 and 100) in 'Top k' Slider component\n", 57 | " 30,\n", 58 | " # int | float (numeric value between 1 and 4) in 'Beams Number' Slider component\n", 59 | " 1,\n", 60 | " # do sample\n", 61 | " True,\n", 62 | " # int | float (numeric value between 1 and 2000) in 'Max New Tokens' Slider component\n", 63 | " 1024,\n", 64 | " # int | float (numeric value between 1 and 300) in 'Min New Tokens' Slider component\n", 65 | " 1,\n", 66 | " # int | float (numeric value between 1.0 and 2.0) in 'Repetition Penalty' Slider component\n", 67 | " 1.2,\n", 68 | "]" 69 | ] 70 | }, 71 | { 72 | "attachments": {}, 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# 基础能力测试" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import pandas as pd\n", 86 | "eval_df = pd.read_json('/nfs/a100-006/hanweiguang/Projects/automatic_evaluate/eval_data.jsonl', lines=True)\n", 87 | "data = eval_df['instruction'].tolist()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# BELLE template\n", 97 | "# template = \\\n", 98 | "# \"\"\"\"Human: \n", 99 | "# {text}\n", 100 | "\n", 101 | "# Assistant: \n", 102 | "# \"\"\"\n", 103 | "\n", 104 | "# glm template\n", 105 | "template = \\\n", 106 | "\"\"\"{text}\"\"\"" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 7, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [17:22<00:00, 1.04s/it]\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "tasks = []\n", 124 | "for i in range(len(data)):\n", 125 | " 
tasks.append([template.format(text=data[i])] + GENERATION_CONFIG)\n", 126 | "answers = client.predict(tasks)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "1000" 138 | ] 139 | }, 140 | "execution_count": 9, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "len(answers)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 8, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/html": [ 157 | "
\n", 158 | "\n", 171 | "\n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
idinstructiongold_responsetyperesponse
01请你写出一个可以生成五个不同随机数的 Python 程序。\\n要生成五个不同的随机数,可以使用 Python 的 random 模块中的 randint(...code以下是使用 Python 的 random 模块生成五个不同随机整数的示例代码:\\n\\n``...
12回答以下问题的答案。\\n中国有多少个省?中国目前有34个省级行政区。具体分为23个省、5个自治区、4个直辖市和2个特别行政区。以下是...open qa中国有34个省级行政区,包括23个省、5个自治区、4个直辖市和2个特别行政区(香港和澳门)。
23什么是机器学习?它有哪些应用场景?\\n机器学习是一种人工智能科学的分支,其目的是使计算机系统能够自动学习和改进,而不需要严格的编程...open qa机器学习是一种人工智能领域的技术,旨在使计算机程序从数据中学习和提取模式,并利用这些知识做出...
34提供一个主题,让学生进行头脑风暴,产生关于该主题的想法。\\n如何提高城市化水平。如何提高城市化水平?\\n头脑风暴产生的想法:\\n1. 投资基础设施:城市化的基础是基础设施。...brainstorming以下是一个主题,可以激发学生的头脑风暴:\\n\\n\"创新城市化解决方案——促进可持续发展和提高...
45针对给定的主题,进行头脑风暴并记录所有想法。\\n如何提高公司的销售额?提高公司的销售额是一个涉及到多种因素的问题。下面是一些可能有用的头脑风暴提示:\\n1. 提供...brainstorming以下是一些可能有助于提高公司销售额的想法:\\n\\n1. 打造一个独特的品牌形象,包括标志、标...
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " id instruction \\\n", 229 | "0 1 请你写出一个可以生成五个不同随机数的 Python 程序。\\n \n", 230 | "1 2 回答以下问题的答案。\\n中国有多少个省? \n", 231 | "2 3 什么是机器学习?它有哪些应用场景?\\n \n", 232 | "3 4 提供一个主题,让学生进行头脑风暴,产生关于该主题的想法。\\n如何提高城市化水平。 \n", 233 | "4 5 针对给定的主题,进行头脑风暴并记录所有想法。\\n如何提高公司的销售额? \n", 234 | "\n", 235 | " gold_response type \\\n", 236 | "0 要生成五个不同的随机数,可以使用 Python 的 random 模块中的 randint(... code \n", 237 | "1 中国目前有34个省级行政区。具体分为23个省、5个自治区、4个直辖市和2个特别行政区。以下是... open qa \n", 238 | "2 机器学习是一种人工智能科学的分支,其目的是使计算机系统能够自动学习和改进,而不需要严格的编程... open qa \n", 239 | "3 如何提高城市化水平?\\n头脑风暴产生的想法:\\n1. 投资基础设施:城市化的基础是基础设施。... brainstorming \n", 240 | "4 提高公司的销售额是一个涉及到多种因素的问题。下面是一些可能有用的头脑风暴提示:\\n1. 提供... brainstorming \n", 241 | "\n", 242 | " response \n", 243 | "0 以下是使用 Python 的 random 模块生成五个不同随机整数的示例代码:\\n\\n``... \n", 244 | "1 中国有34个省级行政区,包括23个省、5个自治区、4个直辖市和2个特别行政区(香港和澳门)。 \n", 245 | "2 机器学习是一种人工智能领域的技术,旨在使计算机程序从数据中学习和提取模式,并利用这些知识做出... \n", 246 | "3 以下是一个主题,可以激发学生的头脑风暴:\\n\\n\"创新城市化解决方案——促进可持续发展和提高... \n", 247 | "4 以下是一些可能有助于提高公司销售额的想法:\\n\\n1. 打造一个独特的品牌形象,包括标志、标... " 248 | ] 249 | }, 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "eval_df['response'] = answers\n", 257 | "eval_df[:5]" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "eval_df.to_json('/nfs/a100-006/hanweiguang/Projects/automatic_evaluate/data/chatglm2-6b.json', lines=True, orient='records', force_ascii=False)" 267 | ] 268 | }, 269 | { 270 | "attachments": {}, 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# 博学测试" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "import json\n", 284 | "with open(\"/nfs/a100-006/hanweiguang/Projects/BELLE/data/boxue/exam_1.jsonl\") as f:\n", 285 | " data = f.readlines()\n", 286 | " data = [json.loads(val) for val in data]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "template = \\\n", 296 | "\"\"\"Human: {type}\n", 297 | "{question}\n", 298 | "{candidates}\n", 299 | "\n", 300 | "Assistant: \n", 301 | "\"\"\"" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "tasks = []\n", 311 | "for i in range(len(data)):\n", 312 | " sample = data[i]\n", 313 | " tasks.append([template.format(\n", 314 | " question=sample['question'].strip(),\n", 315 | " candidates='\\n'.join(sample['candidates']),\n", 316 | " type=sample['type']\n", 317 | " )] + GENERATION_CONFIG)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "answers = client.predict(tasks)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "import pandas as pd\n", 336 | "preds = pd.DataFrame({\"prompt\": [task[0] for task in tasks], \"pred\": answers, \"labels\": [sample[\"answer\"] for sample in data]})\n", 337 | "preds.to_excel('../../data/boxue/pred_1.xlsx')" 338 | ] 339 | }, 340 | { 341 | "attachments": {}, 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "# GLM DEBUG" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | 
"execution_count": 14, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stderr", 355 | "output_type": "stream", 356 | "text": [ 357 | "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00, 1.65s/it]\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "from transformers import AutoTokenizer, AutoModel\n", 363 | "tokenizer = AutoTokenizer.from_pretrained(\"/nfs/v100-022/xytian/chatglm2-6b\", trust_remote_code=True)\n", 364 | "model = AutoModel.from_pretrained('/nfs/v100-022/xytian/chatglm2-6b', trust_remote_code=True)\n", 365 | "model = model.half().cuda().eval()" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 15, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "(None, None, 2)" 377 | ] 378 | }, 379 | "execution_count": 15, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "tokenizer.eos_token_id, tokenizer.bos_token_id, tokenizer.pad_token_id" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 18, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stderr", 395 | "output_type": "stream", 396 | "text": [ 397 | "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n" 398 | ] 399 | }, 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "('你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。',\n", 404 | " [('你好', '你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。')])" 405 | ] 406 | }, 407 | "execution_count": 18, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "response, history = model.chat(tokenizer, \"你好\", history=[])\n", 414 | "response, history" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 19, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "from transformers import GenerationConfig" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "generation_config = GenerationConfig(\n", 433 | " temperature=0.01,\n", 434 | " top_p=0.85,\n", 435 | " top_k=30,\n", 436 | " num_beams=1,\n", 437 | " max_new_tokens=1024, # max_length=max_new_tokens+input_sequence\n", 438 | " min_new_tokens=1, # min_length=min_new_tokens+input_sequence\n", 439 | " repetition_penalty=1.2,\n", 440 | " do_sample=True,\n", 441 | " pad_token_id = tokenizer.pad_token_id\n", 442 | ")\n", 443 | "model.chat(tokenizer, \"你好\", **generation_config.to_dict())" 444 | ] 445 | } 446 | ], 447 | "metadata": { 448 | "kernelspec": { 449 | "display_name": "Python 3 (ipykernel)", 450 | "language": "python", 451 | "name": "python3" 452 | }, 453 | "orig_nbformat": 4 454 | }, 455 | "nbformat": 4, 456 | "nbformat_minor": 2 457 | } --------------------------------------------------------------------------------