├── data
│   └── readme.txt
├── train
│   ├── docs
│   │   ├── interface.png
│   │   ├── FAQ.md
│   │   └── README_en.md
│   ├── requirements.txt
│   ├── configs
│   │   ├── lora_config_bloom.json
│   │   ├── lora_config_llama.json
│   │   ├── deepspeed_config.json
│   │   ├── deepspeed_config_flops_profile.json
│   │   ├── deepspeed_config_stage3.json
│   │   └── deepspeed_config_stage3_flops_profile.json
│   ├── scripts
│   │   ├── download_vicuna_until_success.sh
│   │   ├── run_inference.sh
│   │   ├── run_interface2.sh
│   │   ├── find_libstdc++.sh
│   │   ├── run_multi_backend.sh
│   │   ├── merge_tokenizers.py
│   │   ├── multinode_run.sh
│   │   ├── convert_to_conv_data.py
│   │   ├── run_sft.sh
│   │   ├── run_pt.sh
│   │   ├── run_multi_backend.py
│   │   ├── debug.sh
│   │   └── convert_llama_weights_to_hf.py
│   ├── debug
│   │   ├── templates
│   │   │   ├── error.html
│   │   │   ├── index.html
│   │   │   └── output.html
│   │   └── app.py
│   ├── docker
│   │   ├── upload_image.sh
│   │   ├── build_dockerfile_upon_transfermers.sh
│   │   ├── README.md
│   │   ├── belle.dockerfile
│   │   ├── docker_run.sh
│   │   └── transformers.dockerfile
│   ├── dschat_train_v1
│   │   ├── training_scripts
│   │   │   ├── single_node
│   │   │   │   ├── run_FT.sh
│   │   │   │   └── run_LoRA.sh
│   │   │   ├── single_gpu
│   │   │   │   ├── run_FT.sh
│   │   │   │   └── run_LoRA.sh
│   │   │   ├── convert_to_conv_data.py
│   │   │   └── convert_llama_weights_to_hf.py
│   │   ├── utils
│   │   │   ├── ds_utils.py
│   │   │   ├── model
│   │   │   │   └── model_utils.py
│   │   │   ├── utils.py
│   │   │   ├── module
│   │   │   │   └── lora.py
│   │   │   └── data
│   │   │       └── raw_datasets.py
│   │   ├── docker
│   │   │   └── Dockerfile
│   │   ├── FAQ.md
│   │   ├── reproduce_our_papers
│   │   │   └── Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md
│   │   ├── prompt_eval.py
│   │   └── README.md
│   ├── src
│   │   ├── merge_llama_with_lora.py
│   │   ├── entry_point
│   │   │   ├── inference.py
│   │   │   ├── interface.py
│   │   │   └── evaluation.ipynb
│   │   ├── utils.py
│   │   ├── sample_generator.py
│   │   ├── callbacks.py
│   │   ├── apply_delta.py
│   │   └── trainer.py
│   └── reproduce_our_papers
│       └── Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md
├── requirements.txt
└── README.md
/data/readme.txt:
--------------------------------------------------------------------------------
1 | Training data for CALM-7B
--------------------------------------------------------------------------------
/train/docs/interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dai-shen/CALM-train/HEAD/train/docs/interface.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | rouge_score
3 | fire
4 | openai
5 | transformers
6 | gensim
7 | peft
8 | datasets
9 | bitsandbytes
10 | deepspeed
11 |
--------------------------------------------------------------------------------
/train/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | bitsandbytes
3 | datasets
4 | fire
5 | huggingface_hub
6 | torch
7 | tqdm
8 | transformers
9 | deepspeed
10 | gradio
11 | xformers
--------------------------------------------------------------------------------
/train/configs/lora_config_bloom.json:
--------------------------------------------------------------------------------
1 | {
2 | "lora_r": 16,
3 | "lora_alpha": 32,
4 | "lora_dropout": 0.05,
5 | "lora_target_modules": [
6 | "query_key_value"
7 | ]
8 | }
9 |
--------------------------------------------------------------------------------
/train/scripts/download_vicuna_until_success.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while true; do
4 | python src/apply_delta.py --base 'decapoda-research/llama-7b-hf' --target './weights/vicuna-7b' --delta 'lmsys/vicuna-7b-delta-v1.1' && break
5 | sleep 1
6 | done
7 |
--------------------------------------------------------------------------------
/train/configs/lora_config_llama.json:
--------------------------------------------------------------------------------
1 | {
2 | "lora_r": 16,
3 | "lora_alpha": 32,
4 | "lora_dropout": 0.05,
5 | "lora_target_modules": [
6 | "q_proj",
7 | "k_proj",
8 | "v_proj",
9 | "o_proj",
10 | "down_proj",
11 | "gate_proj",
12 | "up_proj"
13 | ]
14 | }
15 |
--------------------------------------------------------------------------------
/train/scripts/run_inference.sh:
--------------------------------------------------------------------------------
1 | model_name_or_path=/media/data1/fengduanyu/llama-2-7b-chat-T/
2 | ckpt_path=/home/daiyf/daiyf/PIXIU-train/CRA__model_2/model_14956
3 |
4 |
5 | CUDA_VISIBLE_DEVICES=2 python src/entry_point/inference.py \
6 | --model_name_or_path $model_name_or_path \
7 | --ckpt_path $ckpt_path \
8 | --llama
9 |
10 |
--------------------------------------------------------------------------------
/train/scripts/run_interface2.sh:
--------------------------------------------------------------------------------
1 | model_name_or_path=/media/data1/fengduanyu/llama-2-7b-chat-T/
2 | ckpt_path=/home/daiyf/daiyf/PIXIU-train/CRA__model_2/model_14956
3 |
4 |
5 | CUDA_VISIBLE_DEVICES=1 python src/entry_point/interface.py \
6 | --model_name_or_path $model_name_or_path \
7 | --ckpt_path $ckpt_path \
8 | --llama
9 |
10 |
--------------------------------------------------------------------------------
/train/scripts/find_libstdc++.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | files=$(find / -name 'libstdc++.so.6*' 2>/dev/null)
3 | total=$(echo "$files" | wc -l)
4 | counter=1
5 |
6 | echo "$total files found."
7 |
8 | for file in $files; do
9 | echo "Processing file $counter/$total: $file"
10 | if strings "$file" 2>/dev/null | grep -q 'CXXABI_1.3.9'; then
11 | echo "File matches: $file"
12 | fi
13 | ((counter++))
14 | done
15 |
--------------------------------------------------------------------------------
/train/debug/templates/error.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Error
5 |
16 |
17 |
18 | Error
19 | {{ message }}
20 |
21 |
--------------------------------------------------------------------------------
/train/docker/upload_image.sh:
--------------------------------------------------------------------------------
1 | export https_proxy=http://127.0.0.1:65530
2 | export http_proxy=http://127.0.0.1:65530
3 | export all_proxy=socks5://127.0.0.1:65530
4 | docker_user=tothemoon
5 |
6 | # docker tag transformers:ds $docker_user/transformers:ds_$(date +%Y%m%d)
7 | # docker push $docker_user/transformers:ds_$(date +%Y%m%d)
8 | docker tag belle:$(date +%Y%m%d) $docker_user/belle:$(date +%Y%m%d)
9 | docker push $docker_user/belle:$(date +%Y%m%d)
--------------------------------------------------------------------------------
/train/scripts/run_multi_backend.sh:
--------------------------------------------------------------------------------
1 | model_name_or_path='...'
2 | ckpt_path='...'
3 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
4 | export MKL_SERVICE_FORCE_INTEL='1'
5 |
6 | # python scripts/run_multi_backend.py \
7 | # --command "python ../src/entry_point/interface.py --model_name_or_path $model_name_or_path --ckpt_path $ckpt_path"
8 |
9 | python scripts/run_multi_backend.py \
10 | --command "python ../src/entry_point/interface.py --model_name_or_path $model_name_or_path --ckpt_path $ckpt_path --use_lora"
11 |
--------------------------------------------------------------------------------
/train/docker/build_dockerfile_upon_transfermers.sh:
--------------------------------------------------------------------------------
1 | export https_proxy=http://127.0.0.1:65530
2 | export http_proxy=http://127.0.0.1:65530
3 | export all_proxy=socks5://127.0.0.1:65530
4 |
5 | wget https://raw.githubusercontent.com/huggingface/transformers/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile -O transformers.dockerfile
6 | docker build --network host --build-arg HTTP_PROXY=$http_proxy -t transformers:ds -f transformers.dockerfile .
7 | docker build --network host --build-arg HTTP_PROXY=$http_proxy -t belle:$(date +%Y%m%d) -f belle.dockerfile .
8 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/single_node/run_FT.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 |
9 | rm -rf output/
10 | mkdir -p $OUTPUT
11 | echo $OUTPUT
12 | echo $ZERO_STAGE
13 | data_output_path=$OUTPUT/data_files
14 |
15 | #BelleGroup/BELLE-7B-2M
16 | #/workspace/model_name_or_path/hf_llama_7b
17 |
18 | deepspeed main.py \
19 | --sft_only_data_path belleMath.json \
20 | --model_name_or_path /nfs/v100-022/pretrained_ckpt/hf_llama_7b/ \
21 | --per_device_train_batch_size 2 \
22 | --per_device_eval_batch_size 1 \
23 | --max_seq_len 1024 \
24 | --learning_rate 5e-6 \
25 | --weight_decay 0.0001 \
26 | --num_train_epochs 2 \
27 | --gradient_accumulation_steps 4 \
28 | --lr_scheduler_type cosine \
29 | --num_warmup_steps 100 \
30 | --seed 1234 \
31 | --gradient_checkpointing \
32 | --zero_stage $ZERO_STAGE \
33 | --deepspeed \
34 | --output_dir $OUTPUT \
35 | --data_output_path $data_output_path \
36 | # &> $OUTPUT/training.log
37 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/single_gpu/run_FT.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | OUTPUT=$1
7 | ZERO_STAGE=$2
8 |
9 | rm -rf output/
10 | rm -rf $OUTPUT
11 | mkdir -p $OUTPUT
12 | echo $OUTPUT
13 | echo $ZERO_STAGE
14 | data_output_path=$OUTPUT/data_files
15 | #bigscience/bloomz-1b7
16 | #facebook/opt-1.3b
17 | #bigscience/bloomz-560m
18 |
19 | deepspeed --num_gpus 1 main.py \
20 | --sft_only_data_path belleMath.json \
21 | --model_name_or_path bigscience/bloomz-560m \
22 | --per_device_train_batch_size 8 \
23 | --per_device_eval_batch_size 2 \
24 | --max_seq_len 1024 \
25 | --learning_rate 5e-5 \
26 | --weight_decay 0.0001 \
27 | --num_train_epochs 3 \
28 | --gradient_accumulation_steps 8 \
29 | --lr_scheduler_type cosine \
30 | --num_warmup_steps 100 \
31 | --seed 1234 \
32 | --gradient_checkpointing \
33 | --zero_stage $ZERO_STAGE \
34 | --deepspeed \
35 | --output_dir $OUTPUT \
36 | --data_output_path $data_output_path \
37 | # &> $OUTPUT/training.log
38 |
--------------------------------------------------------------------------------
/train/scripts/merge_tokenizers.py:
--------------------------------------------------------------------------------
1 | ''' Train tokenizer'''
2 | import sentencepiece as spm
3 |
4 | spm.SentencePieceTrainer.train(input='/path/to/input_text',
5 | model_prefix='belle',
6 | model_type='bpe',
7 | vocab_size=25000,
8 | character_coverage=0.9995)
9 |
10 | ''' Merge tokenizer '''
11 | import sentencepiece_model_pb2 as model
12 | orig_model_path = '/path/to/llama/tokenizer.model'
13 | belle_model_path = '/path/to/belle/belle.model'
14 | orig_m = model.ModelProto()
15 | belle_m = model.ModelProto()
16 | orig_m.ParseFromString(open(orig_model_path, "rb").read())
17 | belle_m.ParseFromString(open(belle_model_path, "rb").read())
18 | print(len(orig_m.pieces), len(belle_m.pieces))
19 | orig_pieces = []
20 | for piece in orig_m.pieces:
21 | orig_pieces.append(piece.piece)
22 | for piece in belle_m.pieces:
23 | if piece.piece not in orig_pieces:
24 | orig_m.pieces.append(piece)
25 | orig_pieces.append(piece.piece)
26 |
27 | print(len(orig_m.pieces))
28 | save_vocab_path = '/path/to/merge_tokenizer/tokenizer.model'
29 | with open(save_vocab_path, 'wb') as f:
30 | f.write(orig_m.SerializeToString())
--------------------------------------------------------------------------------
/train/scripts/multinode_run.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | node_rank=$1
4 | echo ${node_rank}
5 |
6 | model_name_or_path=/path_to_llm/hf_llama_7b/ # or bloomz-7b1-mt
7 |
8 | train_file=belleMath.json
9 | validation_file=belleMath-dev1K.json
10 | output_dir=saved_models
11 | mkdir -p ${output_dir}
12 |
13 | cache_dir=hf_cache_dir
14 | mkdir -p ${cache_dir}
15 | cutoff_len=1024
16 |
17 | master_addr="10.111.112.223"
18 |
19 | # #Multi-node
20 | torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} src/entry_point/sft_train.py \
21 | --model_name_or_path ${model_name_or_path} \
22 | --llama \
23 | --deepspeed configs/deepspeed_config.json \
24 | --train_file ${train_file} \
25 | --validation_file ${validation_file} \
26 | --per_device_train_batch_size 2 \
27 | --per_device_eval_batch_size 1 \
28 | --gradient_accumulation_steps 1 \
29 | --num_train_epochs 2 \
30 | --model_max_length ${cutoff_len} \
31 | --save_strategy "steps" \
32 | --save_total_limit 3 \
33 | --learning_rate 8e-6 \
34 | --weight_decay 0.00001 \
35 | --warmup_ratio 0.05 \
36 | --lr_scheduler_type "cosine" \
37 | --logging_steps 10 \
38 | --evaluation_strategy "steps" \
39 | --fp16 \
40 | --seed 1234 \
41 | --gradient_checkpointing \
42 | --cache_dir ${cache_dir} \
43 | --output_dir ${output_dir}
44 |
--------------------------------------------------------------------------------
/train/configs/deepspeed_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 2,
4 | "allgather_partitions": true,
5 | "allgather_bucket_size": 2e8,
6 | "reduce_scatter": true,
7 | "reduce_bucket_size": "auto",
8 | "overlap_comm": true,
9 | "contiguous_gradients": true
10 | },
11 | "scheduler": {
12 | "type": "WarmupDecayLR",
13 | "params": {
14 | "warmup_min_lr": "auto",
15 | "warmup_max_lr": "auto",
16 | "warmup_num_steps": "auto",
17 | "total_num_steps": "auto"
18 | }
19 | },
20 | "fp16": {
21 | "enabled": true,
22 | "auto_cast": false,
23 | "loss_scale": 0,
24 | "loss_scale_window": 1000,
25 | "initial_scale_power": 32,
26 | "hysteresis": 2,
27 | "min_loss_scale": 1
28 | },
29 | "bf16": {
30 | "enabled": false
31 | },
32 | "optimizer": {
33 | "type": "AdamW",
34 | "params": {
35 | "lr": "auto",
36 | "weight_decay": "auto",
37 | "betas": "auto",
38 | "eps": "auto",
39 | "torch_adam": true,
40 | "adam_w_mode": true
41 | }
42 | },
43 | "gradient_accumulation_steps": "auto",
44 | "gradient_clipping": "auto",
45 | "steps_per_print": 1000,
46 | "train_batch_size": "auto",
47 | "train_micro_batch_size_per_gpu": "auto",
48 | "wall_clock_breakdown": false
49 | }
50 |
--------------------------------------------------------------------------------
/train/docker/README.md:
--------------------------------------------------------------------------------
1 | # Docker image
2 | ## Replace the proxy with your own (needed for networks in mainland China)
3 | ```bash
4 | export https_proxy=http://127.0.0.1:65530
5 | export http_proxy=http://127.0.0.1:65530
6 | export all_proxy=socks5://127.0.0.1:65530
7 | ```
8 |
9 | ## Build the image
10 | ```bash
11 | sudo bash build_dockerfile_upon_transfermers.sh
12 | ```
13 |
14 | ## Push the image to Docker Hub
15 | ```bash
16 | sudo bash upload_image.sh
17 | ```
18 |
19 | ## Pull the image
20 | The image has already been built, so there is no need to build it yourself
21 | ```bash
22 | sudo docker pull tothemoon/belle:20230728
23 | ```
24 | The belle image includes sshd, so you can connect into the container remotely over SSH
25 |
26 | ## Run the image
27 | ### 1. Install the container-toolkit following the [NVIDIA installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
28 | ### 2. Create your own SSH key pair (see the sketch at the end of this file)
29 | ### 3. Run the container
30 | ```bash
31 | sudo bash docker_run.sh
32 | ```
33 | The main content of `docker_run.sh` is as follows
34 | ```
35 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
36 | --network host \
37 | [--env https_proxy=$https_proxy \]
38 | [--env http_proxy=$http_proxy \]
39 | [--env all_proxy=$all_proxy \]
40 | --env HF_HOME=$hf_home \
41 | -it [--rm] \
42 | --name belle \
43 | -v $belle_path:$belle_path \
44 | -v $hf_home:$hf_home \
45 | -v $ssh_pub_key:/root/.ssh/authorized_keys \
46 | -w $workdir \
47 | $docker_user/belle:$tag \
48 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"]
49 | ```
50 | Items in `[]` are optional
51 | - `--rm`: remove the container when it exits; omit this if you plan to work inside the container long-term
52 | - `--sshd_port`: the port sshd listens on, default 22001
53 | - `--cmd`: the command for the container to run, e.g. `"echo 'Hello, world!' && /bin/bash"`; optional
54 | - `hf_home`: the Hugging Face cache directory
55 | - `$ssh_pub_key`: path to the SSH public key used by sshd
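
A minimal sketch for step 2 above (assumed commands, adjust to your setup): since the container runs with `--network host`, you connect to the host's IP on the port passed as `--sshd_port`.
```bash
# Generate a key pair; point $ssh_pub_key in docker_run.sh at the resulting .pub file
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa
# Connect to the container through the sshd port (2201 in the example above)
ssh -p 2201 root@<host-ip>
```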
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/single_node/run_LoRA.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | #facebook/opt-1.3b
7 | # Note that usually LoRA needs to use larger learning rate
8 | OUTPUT_PATH=$1
9 | ZERO_STAGE=$2
10 | echo $OUTPUT_PATH
11 | echo $ZERO_STAGE
12 | rm -rf output/
13 | mkdir -p $OUTPUT_PATH
14 |
15 |
16 | # model_name_or_path=/workspace/model_name_or_path/hf_llama_7b
17 | # lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj"
18 | # If the model is Bloom, lora_module_name should be
19 | model_name_or_path=/workspace/model_name_or_path/bloomz-7b1-mt
20 | lora_module_name="query_key_value,mlp"
21 |
22 | echo ${lora_module_name}
23 |
24 | deepspeed main.py \
25 | --sft_only_data_path belleMath.json \
26 | --eval_data_file belleMath-dev1K.json \
27 | --data_split 10,0,0 \
28 | --model_name_or_path ${model_name_or_path} \
29 | --per_device_train_batch_size 4 \
30 | --per_device_eval_batch_size 1 \
31 | --max_seq_len 512 \
32 | --learning_rate 3e-4 \
33 | --weight_decay 0. \
34 | --num_train_epochs 5 \
35 | --gradient_accumulation_steps 1 \
36 | --lr_scheduler_type cosine \
37 | --num_warmup_steps 100 \
38 | --seed 1234 \
39 | --gradient_checkpointing \
40 | --zero_stage $ZERO_STAGE \
41 | --lora_dim 16 \
42 | --lora_alpha 16 \
43 | --lora_droppout 0.05 \
44 | --lora_module_name ${lora_module_name} \
45 | --deepspeed \
46 | --output_dir $OUTPUT_PATH \
47 | # &> $OUTPUT_PATH/training.log
48 |
--------------------------------------------------------------------------------
/train/src/merge_llama_with_lora.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from peft import PeftModel
5 | from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
6 |
7 | # Ref: https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py
8 |
9 | def apply_lora(model_name_or_path, output_path, lora_path):
10 | print(f"Loading the base model from {model_name_or_path}")
11 | base = AutoModelForCausalLM.from_pretrained(
12 | model_name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
13 | )
14 | if args.llama:
15 | base_tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)
16 | else:
17 | base_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
18 |
19 | print(f"Loading the LoRA adapter from {lora_path}")
20 |
21 | lora_model = PeftModel.from_pretrained(
22 | base,
23 | lora_path,
24 | torch_dtype=torch.float16,
25 | )
26 |
27 | print("Applying the LoRA")
28 | model = lora_model.merge_and_unload()
29 |
30 | print(f"Saving the target model to {output_path}")
31 | model.save_pretrained(output_path)
32 | base_tokenizer.save_pretrained(output_path)
33 |
34 |
35 | if __name__ == "__main__":
36 | parser = argparse.ArgumentParser()
37 | parser.add_argument("--model_name_or_path", type=str, required=True)
38 | parser.add_argument("--output_path", type=str, required=True)
39 | parser.add_argument("--lora_path", type=str, required=True)
40 | parser.add_argument("--llama", action="store_true", required=True)
41 |
42 | args = parser.parse_args()
43 |
44 | apply_lora(args.model_name_or_path, args.output_path, args.lora_path)
45 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/single_gpu/run_LoRA.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | # DeepSpeed Team
6 | #facebook/opt-1.3b
7 | # Note that usually LoRA needs to use larger learning rate
8 | #/nfs/v100-022/jiyunjie/anaconda3/envs/llamalora/
9 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/v100-022/jiyunjie/anaconda3/envs/llamalora/lib/
10 |
11 | OUTPUT_PATH=$1
12 | ZERO_STAGE=$2
13 |
14 | rm -rf output/
15 | mkdir -p $OUTPUT_PATH
16 | echo $OUTPUT_PATH
17 | echo $ZERO_STAGE
18 |
19 | model_name_or_path=/workspace/model_name_or_path/hf_llama_7b
20 | lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj"
21 | # If the model is Bloom, lora_module_name should be
22 | # model_name_or_path=/workspace/model_name_or_path/bloomz-7b1-mt
23 | # lora_module_name="query_key_value,mlp"
24 |
25 | echo ${lora_module_name}
26 | echo ${model_name_or_path}
27 |
28 | deepspeed --num_gpus 1 main.py \
29 | --sft_only_data_path belleMath.json \
30 | --data_split 10,0,0 \
31 | --model_name_or_path ${model_name_or_path} \
32 | --per_device_train_batch_size 2 \
33 | --per_device_eval_batch_size 1 \
34 | --max_seq_len 1024 \
35 | --learning_rate 3e-4 \
36 | --weight_decay 0. \
37 | --num_train_epochs 5 \
38 | --gradient_accumulation_steps 1 \
39 | --lr_scheduler_type cosine \
40 | --num_warmup_steps 100 \
41 | --seed 1234 \
42 | --gradient_checkpointing \
43 | --zero_stage $ZERO_STAGE \
44 | --lora_dim 16 \
45 | --lora_alpha 16 \
46 | --lora_droppout 0.05 \
47 | --lora_module_name ${lora_module_name} \
48 | --deepspeed \
49 | --output_dir $OUTPUT_PATH \
50 | # &> $OUTPUT_PATH/training.log
51 |
--------------------------------------------------------------------------------
/train/scripts/convert_to_conv_data.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import datetime
4 | import os
5 |
6 | '''
7 | orig_data: {"instruction": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?", "input": "", "output": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}
8 | convert: {
9 | "id": xxx,
10 | "conversations":[
11 | {"from": "human", "value": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?"},
12 | {"from": "assistant", "value": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"},
13 | ]
14 | }
15 | '''
16 |
17 | def main():
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument(
20 | "--orig_data",
21 | )
22 | parser.add_argument(
23 | "--write_data",
24 | )
25 | parser.add_argument(
26 | "--dataset_name",
27 | )
28 | args = parser.parse_args()
29 | f_write = open(args.write_data,"w")
30 | with open(args.orig_data) as f:
31 | lines = f.readlines()
32 | num_id = 1
33 | for line in lines:
34 | data = json.loads(line)
35 | conversations = [{"from": "human", "value": data['instruction']+data['input']},{"from": "assistant", "value": data['output']}]
36 | # conversations = [{"from": "human", "value": data['input']},{"from": "assistant", "value": data['target']}]
37 | uniq_id = data['id'] if "id" in data else args.dataset_name+"-"+str(num_id)
38 | item = {"id":uniq_id, "conversations": conversations}
39 | f_write.write(json.dumps(item, ensure_ascii=False)+"\n")
40 | num_id += 1
41 | f_write.close()
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
--------------------------------------------------------------------------------
/train/configs/deepspeed_config_flops_profile.json:
--------------------------------------------------------------------------------
1 | {
2 | "flops_profiler": {
3 | "enabled": true,
4 | "profile_step": 50,
5 | "module_depth": -1,
6 | "top_modules": 1,
7 | "detailed": true,
8 | "output_file": "profile.log"
9 | },
10 | "zero_optimization": {
11 | "stage": 2,
12 | "allgather_partitions": true,
13 | "allgather_bucket_size": 2e8,
14 | "reduce_scatter": true,
15 | "reduce_bucket_size": "auto",
16 | "overlap_comm": true,
17 | "contiguous_gradients": true
18 | },
19 | "scheduler": {
20 | "type": "WarmupDecayLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto",
25 | "total_num_steps": "auto"
26 | }
27 | },
28 | "fp16": {
29 | "enabled": false,
30 | "auto_cast": false,
31 | "loss_scale": 0,
32 | "loss_scale_window": 1000,
33 | "initial_scale_power": 32,
34 | "hysteresis": 2,
35 | "min_loss_scale": 1
36 | },
37 | "bf16": {
38 | "enabled": true
39 | },
40 | "optimizer": {
41 | "type": "AdamW",
42 | "params": {
43 | "lr": "auto",
44 | "weight_decay": "auto",
45 | "betas": "auto",
46 | "eps": "auto",
47 | "torch_adam": true,
48 | "adam_w_mode": true
49 | }
50 | },
51 | "gradient_accumulation_steps": "auto",
52 | "gradient_clipping": "auto",
53 | "steps_per_print": 1000,
54 | "train_batch_size": "auto",
55 | "train_micro_batch_size_per_gpu": "auto",
56 | "wall_clock_breakdown": false
57 | }
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/convert_to_conv_data.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import datetime
4 | import os
5 |
6 | '''
7 | orig_data: {"instruction": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?", "input": "", "output": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}
8 | convert: {
9 | "id": xxx,
10 | "conversations":[
11 | {"from": "human", "value": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?"},
12 | {"from": "assistant", "value": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"},
13 | ]
14 | }
15 | '''
16 |
17 | def main():
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument(
20 | "--orig_data",
21 | )
22 | parser.add_argument(
23 | "--write_data",
24 | )
25 | parser.add_argument(
26 | "--dataset_name",
27 | )
28 | args = parser.parse_args()
29 | f_write = open(args.write_data,"w")
30 | with open(args.orig_data) as f:
31 | lines = f.readlines()
32 | num_id = 1
33 | for line in lines:
34 | data = json.loads(line)
35 | conversations = [{"from": "human", "value": data['instruction']+data['input']},{"from": "assistant", "value": data['output']}]
36 | # conversations = [{"from": "human", "value": data['input']},{"from": "assistant", "value": data['target']}]
37 | uniq_id = data['id'] if "id" in data else args.dataset_name+"-"+str(num_id)
38 | item = {"id":uniq_id, "conversations": conversations}
39 | f_write.write(json.dumps(item, ensure_ascii=False)+"\n")
40 | num_id += 1
41 | f_write.close()
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
--------------------------------------------------------------------------------
/train/configs/deepspeed_config_stage3.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 3,
4 | "overlap_comm": true,
5 | "contiguous_gradients": true,
6 | "sub_group_size": 0,
7 | "reduce_bucket_size": "auto",
8 | "stage3_prefetch_bucket_size": "auto",
9 | "stage3_param_persistence_threshold": "auto",
10 | "stage3_max_live_parameters": 0,
11 | "stage3_max_reuse_distance": 0,
12 | "stage3_gather_16bit_weights_on_model_save": true,
13 | "offload_param": {
14 | "device": "cpu"
15 | },
16 | "offload_optimizer": {
17 | "device": "cpu"
18 | }
19 | },
20 | "scheduler": {
21 | "type": "WarmupDecayLR",
22 | "params": {
23 | "warmup_min_lr": "auto",
24 | "warmup_max_lr": "auto",
25 | "warmup_num_steps": "auto",
26 | "total_num_steps": "auto"
27 | }
28 | },
29 | "fp16": {
30 | "enabled": false,
31 | "auto_cast": false,
32 | "loss_scale": 0,
33 | "loss_scale_window": 1000,
34 | "initial_scale_power": 32,
35 | "hysteresis": 2,
36 | "min_loss_scale": 1
37 | },
38 | "bf16": {
39 | "enabled": true
40 | },
41 | "optimizer": {
42 | "type": "AdamW",
43 | "params": {
44 | "lr": "auto",
45 | "weight_decay": "auto",
46 | "betas": "auto",
47 | "eps": "auto",
48 | "torch_adam": true,
49 | "adam_w_mode": true
50 | }
51 | },
52 | "gradient_accumulation_steps": "auto",
53 | "gradient_clipping": "auto",
54 | "steps_per_print": 1000,
55 | "train_batch_size": "auto",
56 | "train_micro_batch_size_per_gpu": "auto",
57 | "wall_clock_breakdown": false
58 | }
59 |
--------------------------------------------------------------------------------
/train/configs/deepspeed_config_stage3_flops_profile.json:
--------------------------------------------------------------------------------
1 | {
2 | "flops_profiler": {
3 | "enabled": true,
4 | "profile_step": 50,
5 | "module_depth": -1,
6 | "top_modules": 1,
7 | "detailed": true,
8 | "output_file": "profile.log"
9 | },
10 | "zero_optimization": {
11 | "stage": 3,
12 | "overlap_comm": true,
13 | "contiguous_gradients": true,
14 | "sub_group_size": 0,
15 | "reduce_bucket_size": "auto",
16 | "stage3_prefetch_bucket_size": "auto",
17 | "stage3_param_persistence_threshold": "auto",
18 | "stage3_max_live_parameters": 0,
19 | "stage3_max_reuse_distance": 0,
20 | "stage3_gather_16bit_weights_on_model_save": true,
21 | "offload_param": {
22 | "device": "cpu"
23 | },
24 | "offload_optimizer": {
25 | "device": "cpu"
26 | }
27 | },
28 | "scheduler": {
29 | "type": "WarmupDecayLR",
30 | "params": {
31 | "warmup_min_lr": "auto",
32 | "warmup_max_lr": "auto",
33 | "warmup_num_steps": "auto",
34 | "total_num_steps": "auto"
35 | }
36 | },
37 | "fp16": {
38 | "enabled": false,
39 | "auto_cast": false,
40 | "loss_scale": 0,
41 | "loss_scale_window": 1000,
42 | "initial_scale_power": 32,
43 | "hysteresis": 2,
44 | "min_loss_scale": 1
45 | },
46 | "bf16": {
47 | "enabled": true
48 | },
49 | "optimizer": {
50 | "type": "AdamW",
51 | "params": {
52 | "lr": "auto",
53 | "weight_decay": "auto",
54 | "betas": "auto",
55 | "eps": "auto",
56 | "torch_adam": true,
57 | "adam_w_mode": true
58 | }
59 | },
60 | "gradient_accumulation_steps": "auto",
61 | "gradient_clipping": "auto",
62 | "steps_per_print": 1000,
63 | "train_batch_size": "auto",
64 | "train_micro_batch_size_per_gpu": "auto",
65 | "wall_clock_breakdown": false
66 | }
--------------------------------------------------------------------------------
/train/docker/belle.dockerfile:
--------------------------------------------------------------------------------
1 | FROM transformers:ds
2 | LABEL maintainer="BELLE"
3 | WORKDIR /workspace
4 |
5 | RUN apt update
6 | RUN apt install -y htop
7 | RUN apt install -y screen
8 | RUN apt install -y openssh-server \
9 | && sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config \
10 | && sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config \
11 | && sed -i "s/#PasswordAuthentication yes/PasswordAuthentication no/" /etc/ssh/sshd_config \
12 | && mkdir -p /run/sshd
13 |
14 | RUN python3 -m pip install -U --no-cache-dir pip
15 | RUN python3 -m pip install -U --no-cache-dir peft
16 | RUN python3 -m pip install -U --no-cache-dir gradio
17 | RUN python3 -m pip install -U --no-cache-dir pudb
18 | RUN python3 -m pip install -U --no-cache-dir xformers
19 | RUN python3 -m pip install -U --no-cache-dir bitsandbytes
20 | RUN python3 -m pip install -U --no-build-isolation --no-cache-dir flash-attn
21 |
22 | RUN mkdir -p /scripts && echo -e '#!/bin/bash\n\
23 | SSHD_PORT=22001\n\
24 | CMD_TO_RUN=""\n\
25 | while (( "$#" )); do\n\
26 | case "$1" in\n\
27 | --sshd_port)\n\
28 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then\n\
29 | SSHD_PORT=$2\n\
30 | shift 2\n\
31 | else\n\
32 | echo "Error: Argument for $1 is missing" >&2\n\
33 | exit 1\n\
34 | fi\n\
35 | ;;\n\
36 | --cmd)\n\
37 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then\n\
38 | CMD_TO_RUN=$2\n\
39 | shift 2\n\
40 | else\n\
41 | echo "Error: Argument for $1 is missing" >&2\n\
42 | exit 1\n\
43 | fi\n\
44 | ;;\n\
45 | -*|--*=) \n\
46 | echo "Error: Unsupported flag $1" >&2\n\
47 | exit 1\n\
48 | ;;\n\
49 | *) \n\
50 | shift\n\
51 | ;;\n\
52 | esac\n\
53 | done\n\
54 | sed -i "s/#Port 22/Port $SSHD_PORT/" /etc/ssh/sshd_config\n\
55 | /usr/sbin/sshd\n\
56 | if [ -n "$CMD_TO_RUN" ]; then\n\
57 | bash -c "$CMD_TO_RUN"\n\
58 | else\n\
59 | /bin/bash\n\
60 | fi' > /scripts/startup.sh && chmod +x /scripts/startup.sh
61 |
62 | ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"]
63 |
--------------------------------------------------------------------------------
/train/scripts/run_sft.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # llama1-7b --> abs_path:
3 | # /home/daiyf/daiyf/HFmodel/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16
4 | # llama2-7b-chat --> abs_path:
5 | # /media/data1/fengduanyu/llama-2-7b-chat-T/
6 | export CUDA_VISIBLE_DEVICES='0,1,2,3' # Full
7 | export WANDB_PROJECT=CRA-llama2-7b-chat
8 | export WANDB_RUN_ID=CRA_0.045M
9 | export WANDB_RESUME=allow
10 | export ABS_PATH=""
11 | export PYTHONPATH="path_to/train"
12 | export WANDB_SERVER_PORT=10086
13 | export WANDB_API_KEY=WANDB_API_KEY
14 | model_name_or_path="path_to/llama-2-7b-chat-T"
15 |
16 | train_file="path_to/CRA-resample-train4w.json"
17 | validation_file="path_to/CRA-resample-dev3k.json"
18 | output_dir="$ABS_PATH/saved_models2/${WANDB_PROJECT}_${WANDB_RUN_ID}"
19 | mkdir -p ${output_dir}
20 |
21 | cache_dir="path_to/hf_cache_dir_2" # directory for caching the processed data
22 | log_dir="/path_to/train_log_dir_2"
23 | mkdir -p ${cache_dir}
24 | mkdir -p ${log_dir}
25 | cutoff_len=2048 # maximum input sequence length (recommended: at least 1024 for LLaMA models, at least 512 for Bloom models)
26 | echo ${log_dir}
27 |
28 | # LoRA without 8bit
29 | nohup torchrun --nproc_per_node 2 src/entry_point/sft_train.py \
30 | --model_name_or_path ${model_name_or_path} \
31 | --bf16 True \
32 | --llama True \
33 | --use_lora True \
34 | --deepspeed configs/deepspeed_config_stage3.json \
35 | --lora_config configs/lora_config_llama.json \
36 | --train_file ${train_file} \
37 | --validation_file ${validation_file} \
38 | --per_device_train_batch_size 6 \
39 | --per_device_eval_batch_size 6 \
40 | --gradient_accumulation_steps 1 \
41 | --num_train_epochs 5 \
42 | --model_max_length ${cutoff_len} \
43 | --save_strategy "steps" \
44 | --save_total_limit 3 \
45 | --learning_rate 3e-4 \
46 | --weight_decay 0.00001 \
47 | --warmup_ratio 0.01 \
48 | --lr_scheduler_type "cosine" \
49 | --logging_steps 10 \
50 | --evaluation_strategy "steps" \
51 | --seed 1234 \
52 | --gradient_checkpointing \
53 | --cache_dir ${cache_dir} \
54 | --output_dir ${output_dir} \
55 | --overwrite_output_dir \
56 | > ${log_dir}/train.log 2>&1 &
57 | # --fp16 \
58 | # --resume_from_checkpoint ...
--------------------------------------------------------------------------------
/train/debug/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Text Generation
5 |
6 |
7 |
31 |
32 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/utils/ds_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 | GLOBAL_BATCH_SIZE = 32
6 | MICRO_BATCH_SIZE = 4
7 |
8 |
9 | def get_train_ds_config(offload,
10 | stage=2,
11 | enable_hybrid_engine=False,
12 | inference_tp_size=1,
13 | release_inference_cache=False,
14 | pin_parameters=True,
15 | tp_gather_partition_size=8):
16 |
17 | device = "cpu" if offload else "none"
18 | zero_opt_dict = {
19 | "stage": stage,
20 | "offload_param": {
21 | "device": device
22 | },
23 | "offload_optimizer": {
24 | "device": device
25 | },
26 | "stage3_param_persistence_threshold": 1e4,
27 | "stage3_max_live_parameters": 3e7,
28 | "stage3_prefetch_bucket_size": 3e7,
29 | "memory_efficient_linear": False
30 | }
31 | return {
32 | "train_batch_size": GLOBAL_BATCH_SIZE,
33 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
34 | "steps_per_print": 10,
35 | "zero_optimization": zero_opt_dict,
36 | "fp16": {
37 | "enabled": True,
38 | "loss_scale_window": 100
39 | },
40 | "gradient_clipping": 1.0,
41 | "prescale_gradients": False,
42 | "wall_clock_breakdown": False,
43 | "hybrid_engine": {
44 | "enabled": enable_hybrid_engine,
45 | "inference_tp_size": inference_tp_size,
46 | "release_inference_cache": release_inference_cache,
47 | "pin_parameters": pin_parameters,
48 | "tp_gather_partition_size": tp_gather_partition_size,
49 | }
50 | }
51 |
52 |
53 | def get_eval_ds_config(offload, stage=0):
54 | device = "cpu" if offload else "none"
55 | zero_opt_dict = {
56 | "stage": stage,
57 | "stage3_param_persistence_threshold": 1e4,
58 | "offload_param": {
59 | "device": device
60 | },
61 | "memory_efficient_linear": False
62 | }
63 | return {
64 | "train_batch_size": GLOBAL_BATCH_SIZE,
65 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
66 | "steps_per_print": 10,
67 | "zero_optimization": zero_opt_dict,
68 | "fp16": {
69 | "enabled": True
70 | },
71 | "gradient_clipping": 1.0,
72 | "prescale_gradients": False,
73 | "wall_clock_breakdown": False
74 | }
75 |
--------------------------------------------------------------------------------
/train/docker/docker_run.sh:
--------------------------------------------------------------------------------
1 | # The NVIDIA Container Toolkit must be installed first
2 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
3 |
4 | export https_proxy=...
5 | export http_proxy=...
6 | export all_proxy=...
7 |
8 | belle_path=...
9 | docker_user=...
10 | tag=...
11 | hf_home="/.../.cache/huggingface"
12 | ssh_pub_key="/home/.../.ssh/id_rsa.pub"
13 | workdir="$belle_path/train"
14 | chown root:root $ssh_pub_key
15 |
16 | # docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
17 | # --network host \
18 | # --env HTTP_PROXY=$http_proxy \
19 | # --env HF_HOME=$hf_home \
20 | # -it --rm \
21 | # -v $belle_path:$belle_path \
22 | # -v $hf_home:$hf_home \
23 | # -v $ssh_pub_key:/root/.ssh/authorized_keys \
24 | # -w $workdir \
25 | # $docker_user/transformers:ds_$tag \
26 | # /bin/bash
27 |
28 | # Run in the foreground
29 | # docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
30 | # --network host \
31 | # --env https_proxy=$https_proxy \
32 | # --env http_proxy=$http_proxy \
33 | # --env all_proxy=$all_proxy \
34 | # --env HF_HOME=$hf_home \
35 | # -it --rm \
36 | # --name belle \
37 | # -v $belle_path:$belle_path \
38 | # -v $hf_home:$hf_home \
39 | # -v $ssh_pub_key:/root/.ssh/authorized_keys \
40 | # -w $workdir \
41 | # $docker_user/belle:$tag \
42 | # --sshd_port 2201 --cmd "echo 'export https_proxy=$https_proxy' >> /root/.bashrc && \
43 | # echo 'export http_proxy=$http_proxy' >> /root/.bashrc && \
44 | # echo 'export all_proxy=$all_proxy' >> /root/.bashrc && \
45 | # echo 'export HF_HOME=$hf_home' >> /root/.bashrc && \
46 | # /bin/bash"
47 |
48 | # Run in the background
49 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
50 | --network host \
51 | --env https_proxy=$https_proxy \
52 | --env http_proxy=$http_proxy \
53 | --env all_proxy=$all_proxy \
54 | --env HF_HOME=$hf_home \
55 | -d --rm \
56 | --name belle \
57 | -v $belle_path:$belle_path \
58 | -v $hf_home:$hf_home \
59 | -v $ssh_pub_key:/root/.ssh/authorized_keys \
60 | -w $workdir \
61 | $docker_user/belle:$tag \
62 | --sshd_port 2201 --cmd "echo 'export https_proxy=$https_proxy' >> /root/.bashrc && \
63 | echo 'export http_proxy=$http_proxy' >> /root/.bashrc && \
64 | echo 'export all_proxy=$all_proxy' >> /root/.bashrc && \
65 | echo 'export HF_HOME=$hf_home' >> /root/.bashrc && \
66 | sleep infinity"
--------------------------------------------------------------------------------
/train/src/entry_point/inference.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoConfig
4 | from peft import PeftModel
5 | import argparse
6 | from tqdm import tqdm
7 | import json, os
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--model_name_or_path', type=str, required=True)
10 | parser.add_argument('--ckpt_path', type=str, required=True)
11 | parser.add_argument('--use_lora', action="store_true")
12 | parser.add_argument('--llama', action="store_true")
13 | args = parser.parse_args()
14 |
15 |
16 | max_new_tokens = 1024
17 | generation_config = dict(
18 | temperature=0.001,
19 | top_k=30,
20 | top_p=0.85,
21 | do_sample=True,
22 | num_beams=1,
23 | repetition_penalty=1.2,
24 | max_new_tokens=max_new_tokens
25 | )
26 |
27 | instruction_list = [
28 | "Human: \n小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?\n\nAssistant:\n",
29 | "Human: \n以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?\n\nAssistant:\n",
30 | "Human: \n题目:小明有5个球,他送给小红2个球,还剩多少个球?\n\nAssistant:\n",
31 | "Human: \n请问2+3等于几?\n\nAssistant:\n"
32 | ]
33 |
34 |
35 | if __name__ == '__main__':
36 | load_type = torch.float16 #Sometimes may need torch.float32
37 | if torch.cuda.is_available():
38 | device = torch.device(0)
39 | else:
40 | device = torch.device('cpu')
41 |
42 | if args.llama:
43 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path)
44 | else:
45 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
46 |
47 | tokenizer.pad_token_id = 0
48 | tokenizer.bos_token_id = 1
49 | tokenizer.eos_token_id = 2
50 | tokenizer.padding_side = "left"
51 | model_config = AutoConfig.from_pretrained(args.model_name_or_path)
52 |
53 | if args.use_lora:
54 | base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=load_type)
55 | model = PeftModel.from_pretrained(base_model, args.ckpt_path, torch_dtype=load_type)
56 | else:
57 | model = AutoModelForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, config=model_config)
58 |
59 | if device==torch.device('cpu'):
60 | model.float()
61 |
62 | model.to(device)
63 | model.eval()
64 | print("Load model successfully")
65 |
66 | for instruction in instruction_list:
67 | inputs = tokenizer(instruction, max_length=max_new_tokens,truncation=True,return_tensors="pt")
68 | generation_output = model.generate(
69 | input_ids = inputs["input_ids"].to(device),
70 | **generation_config
71 | )[0]
72 |
73 | generate_text = tokenizer.decode(generation_output,skip_special_tokens=True)
74 | print(generate_text)
75 | print("-"*100)
76 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/utils/model/model_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 | import os
6 | import math
7 | import torch
8 | from transformers import (
9 | AutoConfig,
10 | AutoModel,
11 | )
12 |
13 | from transformers.deepspeed import HfDeepSpeedConfig
14 |
15 | # from .reward_model import RewardModel
16 |
17 |
18 | def create_hf_model(model_class,
19 | model_name_or_path,
20 | tokenizer,
21 | ds_config=None,
22 | rlhf_training=False):
23 | model_config = AutoConfig.from_pretrained(model_name_or_path)
24 | model_config.dropout = 0.0
25 | # Note: dschf is defined in function scope to avoid global effects
26 | # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration
27 | if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3:
28 | dschf = HfDeepSpeedConfig(ds_config)
29 | else:
30 | dschf = None
31 | if rlhf_training:
32 | # the weight loading is handled by create critic model
33 | model = model_class.from_config(model_config)
34 | else:
35 | model = model_class.from_pretrained(
36 | model_name_or_path,
37 | from_tf=bool(".ckpt" in model_name_or_path),
38 | config=model_config)
39 |
40 | # model.config.end_token_id = tokenizer.eos_token_id
41 | # model.config.pad_token_id = model.config.eos_token_id
42 | # model.resize_token_embeddings(int(8 *math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8
43 | return model
44 |
45 |
46 | # def create_critic_model(model_name_or_path,
47 | # tokenizer,
48 | # ds_config,
49 | # num_padding_at_beginning=0,
50 | # rlhf_training=False):
51 | # # OPT model family always put a padding token at the beginning of the sequence,
52 | # # we did not see this in other models but not sure if it is a general rule
53 | # critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer,
54 | # ds_config, rlhf_training)
55 | # critic_model = RewardModel(
56 | # critic_model,
57 | # tokenizer,
58 | # num_padding_at_beginning=num_padding_at_beginning)
59 |
60 | # if rlhf_training:
61 | # # critic model needs to load the weight here
62 | # model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin')
63 | # assert os.path.exists(
64 | # model_ckpt_path
65 | # ), f"Cannot find model checkpoint at {model_ckpt_path}"
66 | # critic_model.load_state_dict(
67 | # torch.load(model_ckpt_path, map_location='cpu'))
68 |
69 | # return critic_model
70 |
--------------------------------------------------------------------------------
/train/docker/transformers.dockerfile:
--------------------------------------------------------------------------------
1 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12
2 | FROM nvcr.io/nvidia/pytorch:22.12-py3
3 | LABEL maintainer="Hugging Face"
4 |
5 | ARG DEBIAN_FRONTEND=noninteractive
6 |
7 | ARG PYTORCH='2.0.1'
8 | # Example: `cu102`, `cu113`, etc.
9 | ARG CUDA='cu118'
10 |
11 | RUN apt -y update
12 | RUN apt install -y libaio-dev
13 | RUN python3 -m pip install --no-cache-dir --upgrade pip
14 |
15 | ARG REF=main
16 | RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
17 |
18 | RUN python3 -m pip uninstall -y torch torchvision torchaudio
19 |
20 | # Install latest release PyTorch
21 | # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
22 | # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
23 | RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
24 |
25 | RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
26 |
27 | RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
28 |
29 | # Uninstall `transformer-engine` shipped with the base image
30 | RUN python3 -m pip uninstall -y transformer-engine
31 |
32 | # Uninstall `torch-tensorrt` shipped with the base image
33 | RUN python3 -m pip uninstall -y torch-tensorrt
34 |
35 | # recompile apex
36 | RUN python3 -m pip uninstall -y apex
37 | RUN git clone https://github.com/NVIDIA/apex
38 | # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
39 | RUN cd apex && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
40 |
41 | # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
42 | RUN python3 -m pip uninstall -y deepspeed
43 | # This has to be run (again) inside the GPU VMs running the tests.
44 | # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
45 | # TODO: Find out why tests fail.
46 | RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
47 |
48 | # When installing in editable mode, `transformers` is not recognized as a package.
49 | # this line must be added in order for python to be aware of transformers.
50 | RUN cd transformers && python3 setup.py develop
51 |
52 | # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
53 | RUN python3 -m pip install -U --no-cache-dir "pydantic<2"
54 | RUN python3 -c "from deepspeed.launcher.runner import main"
55 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
2 | FROM nvcr.io/nvidia/pytorch:22.08-py3
3 | LABEL maintainer="BELLE"
4 |
5 | ARG DEBIAN_FRONTEND=noninteractive
6 |
7 | ARG PYTORCH='2.0.0'
8 | # Example: `cu102`, `cu113`, etc.
9 | ARG CUDA='cu117'
10 |
11 | RUN apt -y update
12 | RUN apt install -y libaio-dev
13 | RUN python3 -m pip install --no-cache-dir --upgrade pip
14 | RUN python3 -m pip install -U pip
15 | RUN python3 -m pip config set global.index-url http://mirrors.aliyun.com/pypi/simple
16 | RUN python3 -m pip config set install.trusted-host mirrors.aliyun.com
17 |
18 | ARG REF=main
19 | RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
20 |
21 | # Install latest release PyTorch
22 | # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
23 | # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
24 | RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
25 |
26 | RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
27 |
28 | # Uninstall `torch-tensorrt` shipped with the base image
29 | RUN python3 -m pip uninstall -y torch-tensorrt
30 |
31 | # recompile apex
32 | RUN python3 -m pip uninstall -y apex
33 | RUN git clone https://github.com/NVIDIA/apex
34 | # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
35 | RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
36 |
37 | # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
38 | RUN python3 -m pip uninstall -y deepspeed
39 | # This has to be run (again) inside the GPU VMs running the tests.
40 | # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
41 | # TODO: Find out why tests fail.
42 | RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
43 |
44 | # When installing in editable mode, `transformers` is not recognized as a package.
45 | # this line must be added in order for python to be aware of transformers.
46 | RUN cd transformers && python3 setup.py develop
47 |
48 | # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
49 | RUN python3 -m pip install -U --no-cache-dir pydantic
50 | RUN python3 -c "from deepspeed.launcher.runner import main"
51 |
52 | # custom packages
53 | RUN python3 -m pip install datasets>=2.8.0
54 | RUN python3 -m pip install sentencepiece>=0.1.97
55 | RUN python3 -m pip install protobuf==3.20.3
56 | RUN python3 -m pip install accelerate>=0.15.0
57 |
--------------------------------------------------------------------------------
/train/docs/FAQ.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | Here we list some problems encountered during our experiments, together with solutions for reference, roughly grouped by category.
4 |
5 | **The solutions are for reference only and may not fully resolve the corresponding problems!!!**
6 |
7 | **The solutions are for reference only and may not fully resolve the corresponding problems!!!**
8 |
9 | **The solutions are for reference only and may not fully resolve the corresponding problems!!!**
10 |
11 | ### DeepSpeed-related
12 |
13 | | Error message | Reference |
14 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
15 | | RuntimeError: CUDA error: an illegal memory access was encountered.CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect | https://github.com/pytorch/pytorch/issues/21819 |
16 | | RuntimeError: Error building extension 'fused_adam' | sudo ln -s /usr/local/cuda/lib64/libcudart.so /usr/lib/libcudart.so |
17 | | RuntimeError: expected scalar type Float but found Half | use_int8_training and deepspeed cannot be specified at the same time |
18 | | RuntimeError: expected scalar type Float but found Half | On V100 GPUs, use_int8_training and fp16 cannot be specified at the same time |
19 |
20 | ### transformers-related
21 |
22 | | Error message | Reference |
23 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
24 | | AutoTokenizer.from_pretrained("llama_model_path") raises a recursion error:<br>RecursionError: maximum recursion depth exceeded | Possibly a transformers version issue; for LlamaModel, load the tokenizer with LlamaTokenizer instead |
25 | | torch.distributed.distributed_c10d.init_process_group() got multiple values for keyword argument 'backend' | Downgrade transformers to 4.28.1 (see the snippet below) |
26 | | | |
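
For the version downgrade mentioned in the table above, a minimal example (pin whichever version your environment actually needs):

```bash
pip install transformers==4.28.1
```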
27 |
28 | ### Other issues
29 |
30 | | Error message | Reference |
31 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
32 | | 8-bit quantized training on V100 machines fails or the loss is abnormal | https://github.com/Facico/Chinese-Vicuna/issues/39<br>https://github.com/TimDettmers/bitsandbytes/issues/100<br>https://github.com/mymusise/ChatGLM-Tuning/issues/19<br>https://github.com/tloen/alpaca-lora/issues/170 |
33 | | | |
34 | | huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': . Use `repo_type` argument if needed. | This happens because model_name_or_path cannot be accessed inside the Docker container; mount it from the corresponding directory on the host machine. |
35 |
36 |
37 |
38 | Here are some practical suggestions from our experiments:
39 |
40 | 1. Training without deepspeed uses more GPU memory; for full-parameter fine-tuning we recommend using deepspeed whenever possible.
41 | 2. For LoRA training, 8-bit quantization and deepspeed are mutually exclusive: if you use 8-bit quantization you cannot use deepspeed, and if you use deepspeed you cannot specify use_int8_training (see the sketch below).
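
A sketch of the two launch variants implied by point 2, assuming the flag names used elsewhere in this repository (`--use_lora`, `--use_int8_training`, `--deepspeed`); the remaining data and logging arguments follow `scripts/run_sft.sh`:

```bash
# Either: LoRA with 8-bit quantization (no deepspeed)
torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
    --use_lora --use_int8_training \
    --model_name_or_path ${model_name_or_path} --train_file ${train_file} --output_dir ${output_dir}

# Or: LoRA with deepspeed ZeRO (no 8-bit quantization)
torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
    --use_lora --deepspeed configs/deepspeed_config.json \
    --model_name_or_path ${model_name_or_path} --train_file ${train_file} --output_dir ${output_dir}
```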
42 |
43 | For deepspeed configuration, see:
44 |
45 | 1. https://github.com/microsoft/DeepSpeed/issues/2187
46 | 2. https://www.deepspeed.ai/tutorials/advanced-install/
47 | 3. https://github.com/pyg-team/pytorch_geometric/issues/1001
48 |
49 |
--------------------------------------------------------------------------------
/train/debug/templates/output.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Token Probabilities
5 |
6 |
71 |
72 |
73 | {% for item in tokens_with_probs %}
74 |
75 | {{ item.token }}
76 | {{ '%.2f'|format(item.prob*100) }}%
77 |
78 | {% endfor %}
79 |
80 |
81 |
82 | {% for i in range(11) %}
83 | {{ i / 10 }}
84 | {% endfor %}
85 |
86 |
87 |
88 |
89 | Loss: {{ loss }}
90 |
91 |
92 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/train/src/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Any, List, Union
3 | from gradio_client import Client
4 | from tqdm import tqdm
5 | from transformers.deepspeed import is_deepspeed_zero3_enabled
6 | from deepspeed.runtime.engine import DeepSpeedEngine
7 | import torch
8 |
9 |
10 | def get_ds_state_dict(ds_engine: DeepSpeedEngine):
11 | """
12 | For ZeRO stage 3 this must be called on all ranks, regardless of the stage3_gather_16bit_weights_on_model_save setting
13 | """
14 | if ds_engine.zero_optimization_partition_weights():
15 | # consolidation is expensive in time and memory and therefore isn't a default
16 | state_dict = ds_engine._zero3_consolidated_16bit_state_dict()
17 | else:
18 | state_dict = ds_engine.module.state_dict()
19 | return state_dict
20 |
21 |
22 | def get_model_param_count(model: Union[DeepSpeedEngine, torch.nn.Module], trainable_only=False):
23 | """
24 | Calculate model's total param count. If trainable_only is True then count only those requiring grads
25 | """
26 | if is_deepspeed_zero3_enabled() and isinstance(model, DeepSpeedEngine):
27 | def numel(p):
28 | return p.ds_numel
29 |
30 | else:
31 | def numel(p):
32 | return p.numel()
33 |
34 | return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)
35 |
36 |
37 | class MultiClient(object):
38 | def __init__(self, worker_addrs) -> None:
39 | self.clients = [Client(addr) for addr in worker_addrs]
40 |
41 | def predict(self, tasks: List[List], max_retries: int = 3) -> List[Any]:
42 | pbar = tqdm(total=len(tasks))
43 | jobs = {
44 | client: (i, client.submit(*(tasks[i]), api_name="/predict"))
45 | for i, client in enumerate(self.clients)
46 | if i < len(tasks)
47 | }
48 | results = {}
49 | retries = {i: 0 for i in range(len(tasks))}
50 |
51 | while jobs:
52 | for client, (i, job) in list(jobs.items()):
53 | if job.done():
54 | pbar.update(1)
55 | del jobs[client]
56 | try:
57 | result = job.result()
58 | results[i] = result
59 | except Exception as e:
60 | print("Job failed with error:", e)
61 | if retries[i] < max_retries:
62 | print("Retrying job...")
63 | retries[i] += 1
64 | new_job = client.submit(
65 | *tasks[i], api_name="/predict")
66 | jobs[client] = (i, new_job)
67 | continue # Skip the rest of the loop
68 | else:
69 | results[i] = None
70 |
71 | if tasks:
72 | new_i = len(results) + len(jobs)
73 | if new_i < len(tasks):
74 | new_task = tasks[new_i]
75 | new_job = client.submit(
76 | *new_task, api_name="/predict")
77 | jobs[client] = (new_i, new_job)
78 | time.sleep(0.1)
79 | pbar.close()
80 |
81 | predicts = [results[i] for i in sorted(results)]
82 |
83 | return predicts
84 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/FAQ.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | **Our machines have 8x A100 40G GPUs, so the questions below only apply to that configuration.**
4 |
5 | For LLaMA models, max_seq_len is usually set to 1024 or 2048.
6 |
7 | For Bloomz models, max_seq_len is usually set to 512 or 1024.
8 |
9 | ## 1. How large a model can be trained on a single GPU on a single machine?
10 |
11 | The current deepspeed-chat project does not support offload yet. From our experiments so far, with max_seq_len set to 1024, the largest model that can be finetuned on a single 40G GPU is roughly bloomz-1b1. (This is not a rigorous conclusion; corrections are welcome.)
12 |
13 | To finetune bloomz-1b7, you may need to set max_seq_len to a very small value.
14 |
15 |
16 |
17 | ## 2. How large a model can be trained on a single machine with multiple GPUs?
18 |
19 | So far, bloomz-7b1 and 7B LLaMA can be trained.
20 |
21 |
22 |
23 | ## 3. How large a model can be trained with LoRA on a single GPU?
24 |
25 | 7B LLaMA can be trained. For bloomz-7b1, max_seq_len needs to be set smaller.
26 |
27 |
28 |
29 | ## 4. How large a model can be trained with LoRA on a single machine with multiple GPUs?
30 |
31 | A 13B model can be trained. We have not yet tried larger models on 8x A100 40G; we welcome discussion and shared experience.
32 |
33 | ## 5. Issues when loading the Llama tokenizer
34 |
35 | The Llama tokenizer initially has no pad_token_id, so one must be assigned.
36 |
37 | During our experiments we found that different transformers versions have issues when loading the Llama vocabulary, recorded below:
38 |
39 | | transformers version | Problem |
40 | | ---------------- | ------------------------------------------------------------ |
41 | | 4.28.0.dev0 | The tokenizer loads correctly in this version: unk_token_id=0, bos_token_id=1, eos_token_id=2 |
42 | | 4.28.1 | AutoTokenizer.from_pretrained raises RecursionError: maximum recursion depth exceeded, so LlamaTokenizer must be used<br>eos_token_id, bos_token_id and unk_token_id are all 0 |
43 | | 4.29.0.dev0 | Same problem as 4.28.1 |
44 |
45 | The current uniform workaround is: if the model is llama, make the following assignments:
46 |
47 | ```python
48 | tokenizer.pad_token_id = 0
49 | tokenizer.bos_token_id = 1
50 | tokenizer.eos_token_id = 2
51 | ```
52 |
53 | ## 6. How much memory and time does loading 2M samples take?
54 |
55 | For 2 million samples, we observed roughly 350G of RAM usage and about 25 minutes of loading time (this is for Bloom; for Llama, tokenization takes longer).
56 |
57 | We have not yet optimized the data-loading code, either for memory or for time.
58 |
59 | ## 7. The trained model generates very poor outputs
60 |
61 | "Poor" here means outputs such as "我们用减法计算出小明还剩下多少个鸡蛋多少个鸡蛋多少个鸡蛋多少个鸡蛋个鸡蛋个鸡蛋减法计算减法计算蛋个鸡蛋", where the model keeps repeating fragments.
62 |
63 | In our experience so far, the main cause of this problem is the special token ids, especially the values of pad_token_id and eos_token_id: make sure the two are different, and that pad_token_id=0 and eos_token_id=2. (This applies to both LLaMA and Bloom.)
64 |
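A quick sanity check for these special token ids before training (a minimal sketch; "llama_model_path" is a placeholder for your local checkpoint directory):

```python
# Minimal sketch: verify the special token ids discussed above before starting training.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("llama_model_path")  # placeholder path
tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2

# pad_token_id and eos_token_id must differ, otherwise generation tends to degenerate.
assert tokenizer.pad_token_id != tokenizer.eos_token_id
print(tokenizer.pad_token_id, tokenizer.bos_token_id, tokenizer.eos_token_id)
```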
65 | ## Others
66 |
67 | Here we list some of the errors we ran into during our experiments, together with reference solutions. (**Note: the reference solutions will not necessarily resolve the corresponding problem.**)
68 |
69 | | Error message | Reference |
70 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
71 | | RuntimeError: CUDA error: an illegal memory access was encountered. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect | https://github.com/pytorch/pytorch/issues/21819 |
72 | | AutoTokenizer.from_pretrained("llama_model_path") raises a recursion error<br>RecursionError: maximum recursion depth exceeded | Likely a transformers version issue; for a LlamaModel, load the tokenizer with LlamaTokenizer |
73 | | xx>=0.11.0 is required for a normal functioning of this module, but found xx==0.10.0 | Caused by a version mismatch; install the version required by the error message |
74 | | torch.distributed.distributed_c10d.init_process_group() got multiple values for keyword argument 'backend' | Downgrade transformers to 4.28.1 |
75 | | RuntimeError: Error building extension 'fused_adam' | sudo ln -s /usr/local/cuda/lib64/libcudart.so /usr/lib/libcudart.so |
76 | | huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': . Use `repo_type` argument if needed. | This happens because model_name_or_path is not visible inside the docker container; mount the corresponding directory from the host machine. |
--------------------------------------------------------------------------------
/train/scripts/run_pt.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
3 | export WANDB_PROJECT=...
4 | export WANDB_RUN_ID=...
5 | export WANDB_RESUME=allow
6 | export ABS_PATH=...
7 | export PYTHONPATH="$ABS_PATH/BELLE/train"
8 | model_name_or_path=/path_to_llm/hf_llama_7b/ # or bloomz-7b1-mt
9 |
10 | train_file=belleMath.json
11 | validation_file=belleMath-dev1K.json
12 | output_dir="$ABS_PATH/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID}"
13 | mkdir -p ${output_dir}
14 |
15 | cache_dir=hf_cache_dir
16 | mkdir -p ${cache_dir}
17 | cutoff_len=1024
18 |
19 | #FT
20 | # torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
21 | # --ddp_timeout 36000 \
22 | # --model_name_or_path ${model_name_or_path} \
23 | # --llama \
24 | # --deepspeed configs/deepspeed_config.json \
25 | # --train_file ${train_file} \
26 | # --validation_file ${validation_file} \
27 | # --per_device_train_batch_size 2 \
28 | # --per_device_eval_batch_size 2 \
29 | # --gradient_accumulation_steps 4 \
30 | # --num_train_epochs 2 \
31 | # --model_max_length ${cutoff_len} \
32 | # --save_strategy "steps" \
33 | # --save_total_limit 3 \
34 | # --learning_rate 8e-6 \
35 | # --weight_decay 0.00001 \
36 | # --warmup_ratio 0.05 \
37 | # --lr_scheduler_type "cosine" \
38 | # --logging_steps 10 \
39 | # --evaluation_strategy "steps" \
40 | # --bf16 \
41 | # --seed 1234 \
42 | # --gradient_checkpointing \
43 | # --cache_dir ${cache_dir} \
44 | # --output_dir ${output_dir} \
45 | # # --resume_from_checkpoint ...
46 |
47 |
48 | #LoRA with 8bit
49 | # torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
50 | # --ddp_timeout 36000 \
51 | # --model_name_or_path ${model_name_or_path} \
52 | # --llama \
53 | # --use_lora \
54 | # --use_int8_training \
55 | # --lora_config configs/lora_config_llama.json \
56 | # --train_file ${train_file} \
57 | # --validation_file ${validation_file} \
58 | # --per_device_train_batch_size 1 \
59 | # --per_device_eval_batch_size 1 \
60 | # --gradient_accumulation_steps 8 \
61 | # --num_train_epochs 2 \
62 | # --model_max_length ${cutoff_len} \
63 | # --save_strategy "steps" \
64 | # --save_total_limit 3 \
65 | # --learning_rate 8e-6 \
66 | # --weight_decay 0.00001 \
67 | # --warmup_ratio 0.05 \
68 | # --lr_scheduler_type "cosine" \
69 | # --logging_steps 10 \
70 | # --evaluation_strategy "steps" \
71 | # --bf16 \
72 | # --seed 1234 \
73 | # --gradient_checkpointing \
74 | # --cache_dir ${cache_dir} \
75 | # --output_dir ${output_dir} \
76 | # # --resume_from_checkpoint ...
77 |
78 | # LoRA without 8bit
79 | torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
80 | --ddp_timeout 36000 \
81 | --model_name_or_path ${model_name_or_path} \
82 | --llama \
83 | --use_lora \
84 | --deepspeed configs/deepspeed_config_stage3.json \
85 | --lora_config configs/lora_config_llama.json \
86 | --train_file ${train_file} \
87 | --validation_file ${validation_file} \
88 | --per_device_train_batch_size 1 \
89 | --per_device_eval_batch_size 1 \
90 | --gradient_accumulation_steps 1 \
91 | --num_train_epochs 10 \
92 | --model_max_length ${cutoff_len} \
93 | --save_strategy "steps" \
94 | --save_total_limit 3 \
95 | --learning_rate 3e-4 \
96 | --weight_decay 0.00001 \
97 | --warmup_ratio 0.01 \
98 | --lr_scheduler_type "cosine" \
99 | --logging_steps 10 \
100 | --evaluation_strategy "steps" \
101 | --bf16 \
102 | --seed 1234 \
103 | --gradient_checkpointing \
104 | --cache_dir ${cache_dir} \
105 | --output_dir ${output_dir} \
106 | # --resume_from_checkpoint ...
107 |
--------------------------------------------------------------------------------
/train/scripts/run_multi_backend.py:
--------------------------------------------------------------------------------
1 | import fcntl
2 | import subprocess
3 | import os
4 | import signal
5 | import sys
6 | import time
7 | from transformers import HfArgumentParser
8 | from dataclasses import dataclass, field
9 |
10 | cuda_devices = os.getenv("CUDA_VISIBLE_DEVICES", "")
11 | if cuda_devices == "":
12 | num_process = 1
13 | cudas = ['']
14 | else:
15 | cudas = [cuda.strip() for cuda in cuda_devices.split(",")]
16 | num_process = len(cudas)
17 |
18 | @dataclass
19 | class Arguments:
20 | command: str = field(metadata={'help': 'command'})
21 |
22 | args = HfArgumentParser((Arguments)).parse_args_into_dataclasses()[0]
23 |
24 | processes = []
25 | outputs = {}  # store each child process's output
26 |
27 | # ANSI escape sequence color codes
28 | COLORS = [
29 |     "\033[31m",  # red
30 |     "\033[32m",  # green
31 |     "\033[33m",  # yellow
32 |     "\033[34m",  # blue
33 |     "\033[35m",  # magenta
34 |     "\033[36m",  # cyan
35 |     "\033[91m",  # light red
36 |     "\033[92m",  # light green
37 |     "\033[93m",  # light yellow
38 |     "\033[94m",  # light blue
39 |     "\033[95m",  # light magenta
40 |     "\033[96m",  # light cyan
41 | ]
42 |
43 | def set_non_blocking(file):
44 | fd = file.fileno()
45 | flags = fcntl.fcntl(fd, fcntl.F_GETFL)
46 | fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
47 |
48 | def terminate_processes_and_exit(exit_code=0):
49 |     print("Terminating child processes...")
50 | for process in processes:
51 | process.terminate()
52 |
53 |     # wait for the child processes to exit
54 | for process in processes:
55 | process.wait()
56 |
57 |     # print each child process's output
58 | for local_rank, output in outputs.items():
59 |         color_code = COLORS[local_rank % len(COLORS)] # pick a color based on the process rank
60 | print(f"{color_code}rank: {local_rank} stdout: {output['stdout'].decode()}\033[0m")
61 | print(f"{color_code}rank: {local_rank} stderr: {output['stderr'].decode()}\033[0m")
62 |
63 |     # exit the main process
64 | sys.exit(exit_code)
65 |
66 | def handle_termination(signal, frame):
67 |     print("Received a termination signal, terminating child processes...")
68 | terminate_processes_and_exit()
69 |
70 | signal.signal(signal.SIGINT, handle_termination)
71 | signal.signal(signal.SIGTERM, handle_termination)
72 |
73 | # copy the parent process's environment variables
74 | parent_env = os.environ.copy()
75 | print(f'cudas: {cudas}')
76 |
77 | for local_rank in range(num_process):
78 |     # set up the environment variables for this child process
79 | env = parent_env.copy()
80 | env['CUDA_VISIBLE_DEVICES'] = f'{cudas[local_rank]}'
81 | command = f'{args.command} --local_rank {local_rank}'
82 | print(command)
83 | process = subprocess.Popen(
84 | command,
85 | shell=True,
86 | env=env,
87 | stdout=subprocess.PIPE,
88 | stderr=subprocess.PIPE,
89 | )
90 | processes.append(process)
91 | outputs[local_rank] = {
92 | "stdout": b"",
93 | "stderr": b"",
94 | }
95 |
96 | for process in processes:
97 | set_non_blocking(process.stdout)
98 | set_non_blocking(process.stderr)
99 |
100 | try:
101 | while True:
102 | for local_rank, process in enumerate(processes):
103 | stdout = process.stdout.read()
104 | stderr = process.stderr.read()
105 |
106 | if stdout:
107 |                 new_output = stdout  # a non-blocking read already returns only the newly available bytes
108 |                 outputs[local_rank]["stdout"] += new_output
109 |                 color_code = COLORS[local_rank % len(COLORS)] # pick a color based on the process rank
110 |                 print(f"{color_code}rank: {local_rank} stdout: {new_output.decode()}\033[0m")
111 |
112 |             if stderr:
113 |                 new_output = stderr  # same as above: accumulate, do not re-slice
114 |                 outputs[local_rank]["stderr"] += new_output
115 |                 color_code = COLORS[local_rank % len(COLORS)] # pick a color based on the process rank
116 |                 print(f"{color_code}rank: {local_rank} stderr: {new_output.decode()}\033[0m")
117 |
118 |         # break once every child process has exited; keep the list unchanged so rank indices stay aligned with outputs
119 |         if all(process.poll() is not None for process in processes):
120 |             break
121 |
122 |
123 | time.sleep(0.1)
124 | except Exception as e:
125 |     print("Exception in the main script:", str(e))
126 | terminate_processes_and_exit(1)
127 |
128 |
--------------------------------------------------------------------------------
/train/scripts/debug.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
3 | export CUDA_VISIBLE_DEVICES='0,1'
4 | export WANDB_PROJECT=debug
5 | export WANDB_RUN_ID=debug
6 | export WANDB_RESUME=allow
7 | export PYTHONPATH='/data/hanweiguang/Projects/BELLE/train'
8 |
9 | # model_name_or_path="decapoda-research/llama-7b-hf"
10 | model_name_or_path="bigscience/bloomz-560m"
11 |
12 | train_file=/data/hanweiguang/Projects/BELLE/data/test_data/test_pt.jsonl
13 | validation_file=/data/hanweiguang/Projects/BELLE/data/test_data/test_pt.jsonl
14 | output_dir=/data/hanweiguang/Projects/BELLE/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID}
15 | rm -rf $output_dir
16 | mkdir -p ${output_dir}
17 |
18 | cache_dir=hf_cache_dir
19 | mkdir -p ${cache_dir}
20 | cutoff_len=32
21 |
22 |
23 | # FT
24 | torchrun --nproc_per_node 2 src/entry_point/pt_train.py \
25 | --model_name_or_path ${model_name_or_path} \
26 | --deepspeed configs/deepspeed_config.json \
27 | --train_file ${train_file} \
28 | --validation_file ${validation_file} \
29 | --per_device_train_batch_size 1 \
30 | --per_device_eval_batch_size 1 \
31 | --gradient_accumulation_steps 1 \
32 | --num_train_epochs 1 \
33 | --model_max_length ${cutoff_len} \
34 | --save_strategy "steps" \
35 | --save_total_limit 3 \
36 | --learning_rate 1e-7 \
37 | --weight_decay 0.00001 \
38 | --warmup_ratio 0.05 \
39 | --lr_scheduler_type "cosine" \
40 | --logging_steps 10 \
41 | --evaluation_strategy "steps" \
42 | --seed 1234 \
43 | --gradient_checkpointing \
44 | --cache_dir ${cache_dir} \
45 | --output_dir ${output_dir} \
46 | --report_to "tensorboard" \
47 | --bf16 \
48 | # --fp16 \
49 | # --llama
50 |
51 | # debug lora single node multiple gpus
52 | # torchrun --nproc_per_node 2 --rdzv-endpoint "127.0.0.1:30012" \
53 | # "src/entry_point/pt_train.py" \
54 | # --model_name_or_path ${model_name_or_path} \
55 | # --train_file ${train_file} \
56 | # --validation_file ${validation_file} \
57 | # --per_device_train_batch_size 1 \
58 | # --per_device_eval_batch_size 1 \
59 | # --gradient_accumulation_steps 1 \
60 | # --num_train_epochs 1 \
61 | # --model_max_length ${cutoff_len} \
62 | # --learning_rate 3e-4 \
63 | # --weight_decay 0.00001 \
64 | # --warmup_ratio 0.01 \
65 | # --lr_scheduler_type "cosine" \
66 | # --logging_steps 10 \
67 | # --save_strategy "steps" \
68 | # --save_total_limit 1 \
69 | # --evaluation_strategy "steps" \
70 | # --seed 1234 \
71 | # --gradient_checkpointing \
72 | # --cache_dir ${cache_dir} \
73 | # --output_dir ${output_dir} \
74 | # --report_to tensorboard \
75 | # --use_lora \
76 | # --lora_config configs/lora_config_bloom.json \
77 | # --deepspeed configs/deepspeed_config_stage3.json \
78 | # --bf16 \
79 | # --fp16 \
80 | # --use_int8_training \
81 | # # --resume_from_checkpoint "/data/hanweiguang/Projects/BELLE/saved_models/boxue_debug_debug/checkpoint-5" \
82 | # # --llama \
83 |
84 | # master_addr='10.201.102.66'
85 | # master_port='65530'
86 |
87 | # # debug lora multiple nodes
88 | # torchrun \
89 | # --nproc_per_node 1 \
90 | # --nnode 2 \
91 | # --node_rank $1 \
92 | # --master_addr $master_addr \
93 | # --master_port $master_port \
94 | # 'src/entry_point/pt_train.py' \
95 | # --model_name_or_path ${model_name_or_path} \
96 | # --train_file ${train_file} \
97 | # --validation_file ${validation_file} \
98 | # --per_device_train_batch_size 1 \
99 | # --per_device_eval_batch_size 1 \
100 | # --gradient_accumulation_steps 1 \
101 | # --num_train_epochs 40 \
102 | # --model_max_length ${cutoff_len} \
103 | # --learning_rate 3e-4 \
104 | # --weight_decay 0.00001 \
105 | # --warmup_ratio 0.01 \
106 | # --lr_scheduler_type "cosine" \
107 | # --logging_steps 10 \
108 | # --save_strategy "steps" \
109 | # --save_total_limit 3 \
110 | # --evaluation_strategy "steps" \
111 | # --fp16 \
112 | # --seed 1234 \
113 | # --gradient_checkpointing \
114 | # --cache_dir ${cache_dir} \
115 | # --output_dir ${output_dir} \
116 | # --deepspeed configs/deepspeed_config_stage3.json \
117 | # --lora_config configs/lora_config_bloom.json \
118 | # --use_lora \
119 | # # --llama \
--------------------------------------------------------------------------------
/train/src/sample_generator.py:
--------------------------------------------------------------------------------
1 | from itertools import chain
2 | from typing import Any, Dict, List
3 | import pudb
4 | import copy
5 | from transformers import PreTrainedTokenizer
6 | import json
7 |
8 | IGNORE_INDEX = -100
9 |
10 |
11 | def generate_and_tokenize_prompt(
12 | model_max_length: int,
13 | tokenizer: PreTrainedTokenizer,
14 | data_point: Dict[str, Any],
15 | fix_length=False,
16 | padding_side="left",
17 | ):
18 | input_ids = []
19 | labels = []
20 | source = data_point["conversations"]
21 | for sentence in source:
22 | sentence_from = sentence["from"].lower()
23 | sentence_value = (
24 | "Human: \n" + sentence["value"] + "\n\nAssistant: \n"
25 | if sentence_from == "human"
26 | else sentence["value"]
27 | ) # https://github.com/LianjiaTech/BELLE/issues/337
28 | # conversation += sentence_value
29 | sentence_ids = tokenizer.encode(
30 | sentence_value, add_special_tokens=False
31 | ) # do not add bos_token_id
32 | label = (
33 | copy.deepcopy(sentence_ids)
34 | if sentence_from != "human"
35 | else [IGNORE_INDEX] * len(sentence_ids)
36 | )
37 | input_ids += sentence_ids
38 | labels += label
39 | # add eos at every end of assistant sentence
40 | if sentence_from != "human":
41 | input_ids += [tokenizer.eos_token_id] # make sure eos_token_id is correct
42 | labels += [tokenizer.eos_token_id]
43 |
44 | input_ids = input_ids[:model_max_length]
45 | labels = labels[:model_max_length]
46 |
47 | if all(x == IGNORE_INDEX for x in labels):
48 | labels[18:24] = input_ids[
49 | 18:24
50 | ] # labels can not have all values being -100. 18 and 24 are just random numbers
51 | attention_mask = [1] * len(input_ids)
52 |
53 | if fix_length:
54 | if padding_side == "left":
55 | input_ids = [tokenizer.pad_token_id] * (
56 | model_max_length - len(input_ids)
57 | ) + input_ids
58 | labels = [tokenizer.pad_token_id] * (
59 | model_max_length - len(labels)
60 | ) + labels
61 | attention_mask = [0] * (
62 | model_max_length - len(attention_mask)
63 | ) + attention_mask
64 | else:
65 | input_ids = input_ids + [tokenizer.pad_token_id] * (
66 | model_max_length - len(input_ids)
67 | )
68 | labels = labels + [tokenizer.pad_token_id] * (
69 | model_max_length - len(labels)
70 | )
71 | attention_mask = attention_mask + [0] * (
72 | model_max_length - len(attention_mask)
73 | )
74 |
75 | tokenized_full_prompt = {
76 | "input_ids": input_ids,
77 | "attention_mask": attention_mask,
78 | "labels": labels,
79 | }
80 | return tokenized_full_prompt
81 |
82 |
83 | def batch_grouped_pretrain_generate(
84 | model_max_length: int,
85 | tokenizer: PreTrainedTokenizer,
86 | examples: Dict[str, List[str]],
87 | ) -> Dict[str, List[List[int]]]:
88 | # build grouped texts with format `X1 X2 X3 ... X1 X2 X3 ... []`
89 | token_ids_list: List[List[int]] = tokenizer(
90 | examples["text"], add_special_tokens=False
91 | )["input_ids"]
92 | token_ids_list = [
93 | token_ids + [tokenizer.eos_token_id] for token_ids in token_ids_list
94 | ]
95 | concatenated_ids = list(chain(*token_ids_list))
96 | # we drop the small remainder, and if the total_length < block_size, we exclude this batch
97 | total_length = (len(concatenated_ids) // model_max_length) * model_max_length
98 | result = [
99 | concatenated_ids[i : i + model_max_length]
100 | for i in range(0, total_length, model_max_length)
101 | ]
102 | return {"input_ids": result, "labels": result.copy()}
103 |
104 |
105 | def exam_generate(model_max_length: int, tokenizer: PreTrainedTokenizer, data_point):
106 | template = "Human: \n{human}\n\nAssistant: \n"
107 | # pudb.set_trace()
108 | input_str = template.format(
109 | human=f'回答下面的{data_point["type"]}题,用json返回答案,包括原因和答案,如{{"reason":..., "answer":...}}\n{data_point["question"]}\n选项:{" ".join(data_point["candidates"])}'
110 | )
111 | input_ids = tokenizer.encode(input_str, add_special_tokens=False)
112 | labels = [IGNORE_INDEX] * len(input_ids)
113 | bot_ids = tokenizer.encode(
114 | json.dumps(
115 | {"reason": data_point["reason"], "answer": data_point["answer"]},
116 | ensure_ascii=False,
117 | ),
118 | add_special_tokens=False,
119 | )
120 | input_ids += bot_ids
121 | labels += bot_ids
122 |
123 | input_ids += [tokenizer.eos_token_id]
124 | labels += [tokenizer.eos_token_id]
125 |
126 | input_ids = input_ids[: model_max_length - 1]
127 | labels = labels[: model_max_length - 1]
128 | return {
129 | "input_ids": input_ids,
130 | "attention_mask": [1] * len(input_ids),
131 | "labels": labels,
132 | }
133 |
--------------------------------------------------------------------------------
/train/reproduce_our_papers/Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md:
--------------------------------------------------------------------------------
1 | ### [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://github.com/LianjiaTech/BELLE/blob/main/docs/Towards%20Better%20Instruction%20Following%20Language%20Models%20for%20Chinese.pdf)
2 |
3 | # Paper overview
4 |
5 | This paper studies how well models trained on currently available public instruction data can perform. We carried out a quantitative evaluation on our own test set of 1,000 examples. In addition, to improve the model's performance on Chinese and its training/inference efficiency, we further extended LLaMA's vocabulary and ran secondary pretraining on 3.4 billion Chinese tokens.
6 |
7 | The public instruction-tuning datasets we used are:
8 |
9 | 1. Stanford Alpaca data in Chinese and English generated with GPT-3.5
10 | 2. Alpaca data in Chinese and English generated with GPT-4
11 | 3. sharegpt, real user conversations with ChatGPT shared by users
12 |
13 | We focus on how the type of training data affects model performance; specifically, we examine factors such as the quantity, quality and language distribution of the training data.
14 |
15 | The experimental results are as follows:
16 |
17 | | Factor | Base model | Training data | Score_w/o_others |
18 | | -------------------------- | ------------ | ---------------------------------------------- | ---------------- |
19 | | Vocabulary expansion | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
20 | | | LLaMA-7B | zh(alpaca-3.5&4) + sharegpt | 0.652 |
21 | | Data quality | LLaMA-7B-EXT | zh(alpaca-3.5) | 0.642 |
22 | | | LLaMA-7B-EXT | zh(alpaca-4) | 0.693 |
23 | | Data language distribution | LLaMA-7B-EXT | cn(alpaca-3.5&4) | 0.679 |
24 | | | LLaMA-7B-EXT | en(alpaca-3.5&4) | 0.659 |
25 | | | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
26 | | | LLaMA-7B-EXT | en(alpaca-3.5&4) + sharegpt | 0.668 |
27 | | Data scale | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
28 | | | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt + BELLE-0.5M-CLEAN | 0.762 |
29 | | - | ChatGPT | - | 0.824 |
30 |
84 | **BELLE-0.5M-CLEAN** is a set of 0.5M samples cleaned from our internal 2.3M instruction data; it contains both single-turn and multi-turn conversations and is not the same batch as the previously released 0.5M data. This data has not been open-sourced yet, but we have released the best-performing model from these experiments (score 0.762) on [Hugging Face](https://huggingface.co/BelleGroup/BELLE-on-Open-Datasets).
85 |
86 | # Preparing the datasets
87 |
88 | ### Download the datasets
89 |
90 | Our paper uses five open-source datasets:
91 |
92 | | Data | URL |
93 | | -------------- | -------------------------------------------------------------------------------------------------- |
94 | | alpaca-3.5-en | https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json |
95 | | alpaca-3.5-zh | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/dat |
96 | | alpaca-4-en | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data.json |
97 | | alpaca-4-zh | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data_zh.json |
98 | | sharegpt$^1$ | https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main |
99 |
100 | $^1$: Note that the sharegpt dataset may keep being updated, so it may differ slightly from the version used in our paper; this does not materially affect the experimental conclusions.
101 |
102 | ### Preprocessing
103 |
104 | ##### Cleaning the sharegpt dataset
105 |
106 | We follow the data-cleaning procedure from [Vicuna](https://github.com/lm-sys/FastChat/blob/main/docs/commands/data_cleaning.md):
107 |
108 | 1. Convert HTML to markdown with `fastchat.data.clean_sharegpt`
109 | 2. Remove languages other than English and Chinese with `fastchat.data.optional_clean`
110 | 3. Split conversations into segments of at most 2048 tokens with `fastchat.data.split_long_conversation`
111 |
112 | #### Unified data format
113 |
114 | We convert all datasets into the following form:
115 |
116 | ```python
117 | {
118 | "id": "uniq_sample_id",
119 | "conversations": [
120 | {"from": "human", "value": "你好"},
121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"},
122 | {"from": "human", "value": "今天天气怎么样?"},
123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"}
124 | ]
125 | }
126 |
127 | ```
128 |
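For illustration, a minimal sketch of turning one Alpaca-style record into this conversation format. The field names `instruction`/`input`/`output` follow the public Alpaca releases; the repository's own `scripts/convert_to_conv_data.py` is the authoritative converter:

```python
# Illustrative sketch: map one Alpaca-style record to the unified conversation format above.
import json

def alpaca_to_conversation(record: dict, sample_id: str) -> dict:
    human_text = record["instruction"]
    if record.get("input"):
        human_text += "\n" + record["input"]
    return {
        "id": sample_id,
        "conversations": [
            {"from": "human", "value": human_text},
            {"from": "assistant", "value": record["output"]},
        ],
    }

example = {"instruction": "Give three tips for staying healthy.", "input": "", "output": "..."}
print(json.dumps(alpaca_to_conversation(example, "alpaca_0"), ensure_ascii=False, indent=2))
```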
129 | # Download LLaMA-EXT-7B
130 |
131 | LLaMA-EXT-7B is based on [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/), further pretrained on 3.4 billion Chinese tokens, with the vocabulary extended to 79,458 tokens.
132 | It is now [available on Hugging Face](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B); subsequent model training should start from it.
133 |
134 | # Training
135 |
136 | With LLaMA-EXT-7B as the base model, we train with the following hyperparameters.
137 |
138 | | Hyper parameter | Value |
139 | | ----------------- | ------ |
140 | | Precision | bf16 |
141 | | Epochs | 3 |
142 | | Batch size | 32 |
143 | | Learning rate | 5e-6 |
144 | | Weight decay | 0 |
145 | | Warmup ratio | 0.03 |
146 | | LR scheduler type | cosine |
147 | | Max length | 2048 |
148 |
149 | Our open-source [code](../README.md) can be used to train the model (a rough mapping of these hyperparameters onto Hugging Face TrainingArguments is sketched below).
150 |
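The batch size of 32 is the global batch size, so per-device batch size, gradient accumulation and the number of GPUs are assumed to multiply out to 32; the launch scripts under train/scripts remain authoritative:

```python
# Illustrative sketch: the table's hyperparameters expressed as Hugging Face TrainingArguments.
# per_device_train_batch_size * gradient_accumulation_steps * number_of_gpus is assumed to equal
# the global batch size of 32; the max length of 2048 is applied at tokenization time.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="saved_models/llama_ext_7b_sft",  # placeholder path
    bf16=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=5e-6,
    weight_decay=0.0,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
)
```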
--------------------------------------------------------------------------------
/train/dschat_train_v1/reproduce_our_papers/Towards Better Instruction Following Language Models for Chinese_ Investigating the Impact of Training Data and Evaluation.md:
--------------------------------------------------------------------------------
1 | ### [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://github.com/LianjiaTech/BELLE/blob/main/docs/Towards%20Better%20Instruction%20Following%20Language%20Models%20for%20Chinese.pdf)
2 |
3 | # Paper overview
4 |
5 | This paper studies how well models trained on currently available public instruction data can perform. We carried out a quantitative evaluation on our own test set of 1,000 examples. In addition, to improve the model's performance on Chinese and its training/inference efficiency, we further extended LLaMA's vocabulary and ran secondary pretraining on 3.4 billion Chinese tokens.
6 |
7 | The public instruction-tuning datasets we used are:
8 |
9 | 1. Stanford Alpaca data in Chinese and English generated with GPT-3.5
10 | 2. Alpaca data in Chinese and English generated with GPT-4
11 | 3. sharegpt, real user conversations with ChatGPT shared by users
12 |
13 | We focus on how the type of training data affects model performance; specifically, we examine factors such as the quantity, quality and language distribution of the training data.
14 |
15 | The experimental results are as follows:
16 |
17 | | Factor | Base model | Training data | Score_w/o_others |
18 | | -------------------------- | ------------ | ---------------------------------------------- | ---------------- |
19 | | Vocabulary expansion | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
20 | | | LLaMA-7B | zh(alpaca-3.5&4) + sharegpt | 0.652 |
21 | | Data quality | LLaMA-7B-EXT | zh(alpaca-3.5) | 0.642 |
22 | | | LLaMA-7B-EXT | zh(alpaca-4) | 0.693 |
23 | | Data language distribution | LLaMA-7B-EXT | cn(alpaca-3.5&4) | 0.679 |
24 | | | LLaMA-7B-EXT | en(alpaca-3.5&4) | 0.659 |
25 | | | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
26 | | | LLaMA-7B-EXT | en(alpaca-3.5&4) + sharegpt | 0.668 |
27 | | Data scale | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt | 0.670 |
28 | | | LLaMA-7B-EXT | zh(alpaca-3.5&4) + sharegpt + BELLE-0.5M-CLEAN | 0.762 |
29 | | - | ChatGPT | - | 0.824 |
30 |
84 | **BELLE-0.5M-CLEAN** is a set of 0.5M samples cleaned from our internal 2.3M instruction data; it contains both single-turn and multi-turn conversations and is not the same batch as the previously released 0.5M data. This data has not been open-sourced yet, but we have released the best-performing model from these experiments (score 0.762) on [Hugging Face](https://huggingface.co/BelleGroup/BELLE-on-Open-Datasets).
85 |
86 | # Preparing the datasets
87 |
88 | ### Download the datasets
89 |
90 | Our paper uses five open-source datasets:
91 |
92 | | Data | URL |
93 | | -------------- | -------------------------------------------------------------------------------------------------- |
94 | | alpaca-3.5-en | https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json |
95 | | alpaca-3.5-zh | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/dat |
96 | | alpaca-4-en | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data.json |
97 | | alpaca-4-zh | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data_zh.json |
98 | | sharegpt$^1$ | https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main |
99 |
100 | $^1$: Note that the sharegpt dataset may keep being updated, so it may differ slightly from the version used in our paper; this does not materially affect the experimental conclusions.
101 |
102 | ### Preprocessing
103 |
104 | ##### Cleaning the sharegpt dataset
105 |
106 | We follow the data-cleaning procedure from [Vicuna](https://github.com/lm-sys/FastChat/blob/main/docs/commands/data_cleaning.md):
107 |
108 | 1. Convert HTML to markdown with `fastchat.data.clean_sharegpt`
109 | 2. Remove languages other than English and Chinese with `fastchat.data.optional_clean`
110 | 3. Split conversations into segments of at most 2048 tokens with `fastchat.data.split_long_conversation`
111 |
112 | #### Unified data format
113 |
114 | We convert all datasets into the following form:
115 |
116 | ```python
117 | {
118 | "id": "uniq_sample_id",
119 | "conversations": [
120 | {"from": "human", "value": "你好"},
121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"},
122 | {"from": "human", "value": "今天天气怎么样?"},
123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"}
124 | ]
125 | }
126 |
127 | ```
128 |
129 | # Download LLaMA-EXT-7B
130 |
131 | LLaMA-EXT-7B is based on [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/), further pretrained on 3.4 billion Chinese tokens, with the vocabulary extended to 79,458 tokens.
132 | It is now [available on Hugging Face](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B); subsequent model training should start from it.
133 |
134 | # Training
135 |
136 | With LLaMA-EXT-7B as the base model, we train with the following hyperparameters.
137 |
138 | | Hyper parameter | Value |
139 | | ----------------- | ------ |
140 | | Precision | bf16 |
141 | | Epochs | 3 |
142 | | Batch size | 32 |
143 | | Learning rate | 5e-6 |
144 | | Weight decay | 0 |
145 | | Warmup ratio | 0.03 |
146 | | LR scheduler type | cosine |
147 | | Max length | 2048 |
148 |
149 | Our open-source [code](../README.md) can be used to train the model.
150 |
--------------------------------------------------------------------------------
/train/src/callbacks.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import torch
3 | from tqdm import tqdm
4 | from transformers import (
5 | TrainerCallback,
6 | Trainer,
7 | TrainingArguments,
8 | TrainerState,
9 | TrainerControl,
10 | PreTrainedModel,
11 | GenerationConfig
12 | )
13 | from typing import Callable, Union
14 | import weakref
15 | from torch.utils.data import DataLoader, SequentialSampler
16 | import pudb
17 |
18 |
19 | class GenerationAndSaveBestModelsCallback(TrainerCallback):
20 | def __init__(
21 | self,
22 | trainer: Trainer,
23 | metric_fn: Callable[
24 | [
25 | Union[torch.Tensor, torch.cuda.FloatTensor],
26 | Union[torch.Tensor, torch.cuda.FloatTensor],
27 | ],
28 | float,
29 | ],
30 | eval_dataset,
31 | save_dir,
32 | num_best_models=3,
33 | eval_steps: int = 100,
34 | ):
35 | super().__init__()
36 |         self.trainer_ref = weakref.ref(trainer)  # keep a weak reference to the Trainer
37 | ignored_columns = list(
38 | set(eval_dataset.column_names) - {'input_ids', 'labels'})
39 | eval_dataset = eval_dataset.remove_columns(ignored_columns)
40 | self.eval_dataloader = DataLoader(
41 | eval_dataset,
42 | sampler=SequentialSampler(eval_dataset),
43 | batch_size=trainer.args.eval_batch_size,
44 | collate_fn=trainer.data_collator,
45 | drop_last=trainer.args.dataloader_drop_last,
46 | num_workers=trainer.args.dataloader_num_workers,
47 | pin_memory=trainer.args.dataloader_pin_memory,
48 | )
49 | self.save_dir = save_dir
50 | self.num_best_models = num_best_models
51 | self.eval_steps = eval_steps
52 | self.best_eval_results = []
53 | self.metric_fn = metric_fn
54 |
55 | self.generation_config = GenerationConfig(
56 | temperature=0.001,
57 | top_k=30,
58 | top_p=0.85,
59 | do_sample=True,
60 | num_beams=1,
61 | bos_token_id=1,
62 | eos_token_id=2,
63 | pad_token_id=0,
64 | repetition_penalty=1.2,
65 | max_new_tokens=1024,
66 | min_new_tokens=1,
67 | )
68 |
69 | def on_step_end(
70 | self,
71 | args: TrainingArguments,
72 | state: TrainerState,
73 | control: TrainerControl,
74 | **kwargs,
75 | ):
76 | if state.global_step % self.eval_steps == 0:
77 | trainer = self.trainer_ref() # 获取Trainer对象的引用
78 | if trainer is None:
79 | return
80 | if args.local_rank != -1:
81 |                 torch.distributed.barrier()  # synchronize all processes
82 |             # TODO: eval currently runs on a single rank only; parallel eval across ranks is not implemented yet
83 | if state.is_local_process_zero:
84 | model: PreTrainedModel = trainer.model
85 | model.eval()
86 |
87 | with torch.no_grad():
88 | metric_value = 0
89 | total_samples = 0
90 |                     # pudb.set_trace()  # leftover debugging breakpoint, disabled so evaluation does not hang
91 | for batch in tqdm(self.eval_dataloader, desc='Customize Eval'):
92 | input_ids = batch["input_ids"].to(model.device)
93 | generated_ids = model.generate(
94 | input_ids=input_ids, generation_config=self.generation_config)
95 |                         # metric value computed from generated_ids
96 | metric = self.metric_fn(generated_ids, batch["labels"])
97 | metric_value += metric * input_ids.size(0)
98 | total_samples += input_ids.size(0)
99 | metric_value /= total_samples
100 | if len(self.best_eval_results) < self.num_best_models:
101 | self.best_eval_results.append(
102 | (metric_value, state.global_step))
103 |                         # TODO: LoRA adapters need extra handling here
104 | model.save_pretrained(
105 | f"{self.save_dir}/best_model_{state.global_step}"
106 | )
107 | else:
108 | self.best_eval_results.sort()
109 | worst_result, worst_step = self.best_eval_results[0]
110 | if metric_value > worst_result:
111 | self.best_eval_results[0] = (
112 | metric_value,
113 | state.global_step,
114 | )
115 |                             # TODO: LoRA adapters need extra handling here
116 | model.save_pretrained(
117 | f"{self.save_dir}/best_model_{state.global_step}"
118 | )
119 | shutil.rmtree(
120 | f"{self.save_dir}/best_model_{worst_step}")
121 | if args.local_rank != -1:
122 |                 torch.distributed.barrier()  # synchronize all processes
123 |
124 |
125 | def metric_fn(generated_ids, label_ids):
126 |     # This is a simple example; the actual metric_fn should be defined according to your task and evaluation criteria
127 | return generated_ids.mean().item()
128 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/utils/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 | import os
6 | import torch
7 | import random
8 | import numpy as np
9 | from transformers import set_seed
10 | import deepspeed
11 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
12 |
13 |
14 | def print_rank_0(msg, rank=0):
15 | if rank <= 0:
16 | print(msg)
17 |
18 |
19 | def to_device(batch, device):
20 | output = {}
21 | for k, v in batch.items():
22 | try:
23 | output[k] = v.to(device)
24 | except:
25 | output[k] = v
26 | return output
27 |
28 |
29 | class MovingAverage:
30 |
31 | def __init__(self):
32 | self.count = 0
33 | self.total = 0
34 | self.mean = 0
35 |
36 | def update(self, num):
37 | self.total += num
38 | self.count += 1
39 | self.mean = self.total / self.count
40 |
41 | return self.mean
42 |
43 |
44 | def save_hf_format(model, tokenizer, args, sub_folder=""):
45 | # used to save huggingface format, so we can use it for hf.from_pretrained
46 | model_to_save = model.module if hasattr(model, 'module') else model
47 | CONFIG_NAME = "config.json"
48 | WEIGHTS_NAME = "pytorch_model.bin"
49 | output_dir = os.path.join(args.output_dir, sub_folder)
50 | if not os.path.exists(output_dir):
51 | os.makedirs(output_dir)
52 | output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
53 | output_config_file = os.path.join(output_dir, CONFIG_NAME)
54 | save_dict = model_to_save.state_dict()
55 | for key in list(save_dict.keys()):
56 | if "lora" in key:
57 | del save_dict[key]
58 | torch.save(save_dict, output_model_file)
59 | model_to_save.config.to_json_file(output_config_file)
60 | tokenizer.save_vocabulary(output_dir)
61 |
62 |
63 | def set_random_seed(seed):
64 | if seed is not None:
65 | set_seed(seed)
66 | random.seed(seed)
67 | np.random.seed(seed)
68 | torch.manual_seed(seed)
69 | torch.cuda.manual_seed_all(seed)
70 |
71 |
72 | def get_all_reduce_mean(tensor):
73 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
74 | tensor = tensor / torch.distributed.get_world_size()
75 | return tensor
76 |
77 |
78 | def get_optimizer_grouped_parameters(model,
79 | weight_decay,
80 | no_decay_name_list=[
81 | "bias", "LayerNorm.weight"
82 | ]):
83 | optimizer_grouped_parameters = [
84 | {
85 | "params": [
86 | p for n, p in model.named_parameters()
87 | if (not any(nd in n
88 | for nd in no_decay_name_list) and p.requires_grad)
89 | ],
90 | "weight_decay":
91 | weight_decay,
92 | },
93 | {
94 | "params": [
95 | p for n, p in model.named_parameters()
96 | if (any(nd in n
97 | for nd in no_decay_name_list) and p.requires_grad)
98 | ],
99 | "weight_decay":
100 | 0.0,
101 | },
102 | ]
103 | return optimizer_grouped_parameters
104 |
105 |
106 | def _z3_params_to_fetch(param_list):
107 | return [
108 | p for p in param_list
109 | if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE
110 | ]
111 |
112 |
113 | def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0):
114 | zero_stage_3 = (zero_stage == 3)
115 | with torch.no_grad():
116 | for param, param_ema in zip(model.parameters(),
117 | model_ema.parameters()):
118 | # TODO: use prefiltering for efficiency
119 | params_to_fetch = _z3_params_to_fetch([param, param_ema
120 | ]) if zero_stage_3 else []
121 | should_gather_param = len(params_to_fetch) > 0
122 | with deepspeed.zero.GatheredParameters(
123 | params_to_fetch, enabled=should_gather_param):
124 | data = param.data
125 | if device is not None:
126 | data = data.to(device)
127 | param_ema.data.copy_(torch.lerp(data, param_ema.data, beta))
128 |
129 |
130 | def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0):
131 | zero_stage_3 = (zero_stage == 3)
132 | os.makedirs(save_dir, exist_ok=True)
133 | WEIGHTS_NAME = "pytorch_model.bin"
134 | output_model_file = os.path.join(save_dir, WEIGHTS_NAME)
135 |
136 | model_to_save = model_ema.module if hasattr(model_ema,
137 | 'module') else model_ema
138 | if not zero_stage_3:
139 | if global_rank == 0:
140 | torch.save(model_to_save.state_dict(), output_model_file)
141 | else:
142 | output_state_dict = {}
143 | for k, v in model_to_save.named_parameters():
144 |
145 | if hasattr(v, 'ds_id'):
146 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v
147 | ]),
148 | enabled=zero_stage_3):
149 | v_p = v.data.cpu()
150 | else:
151 | v_p = v.cpu()
152 | if global_rank == 0 and "lora" not in k:
153 | output_state_dict[k] = v_p
154 | if global_rank == 0:
155 | torch.save(output_state_dict, output_model_file)
156 | del output_state_dict
157 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/prompt_eval.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import json
3 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, AutoConfig
4 | import argparse
5 | from tqdm import tqdm
6 | import json, os
7 | parser = argparse.ArgumentParser()
8 |
9 | parser.add_argument('--model_name_or_path',required=True,type=str)
10 | parser.add_argument('--finetuned_model_name_or_path',required=True,type=str)
11 | parser.add_argument('--test_file',required=True,type=str)
12 | parser.add_argument('--predictions_file', default='./predictions.json', type=str)
13 | args = parser.parse_args()
14 |
15 | print("test_file: " + args.test_file)
16 | print("model_name_or_path: " + args.model_name_or_path)
17 | print("finetuned_model_name_or_path: " + args.finetuned_model_name_or_path)
18 |
19 | max_new_tokens = 1024
20 | generation_config = dict(
21 | temperature=0.001,
22 | top_k=30,
23 | top_p=0.85,
24 | do_sample=True,
25 | num_beams=1,
26 | repetition_penalty=1.2,
27 | max_new_tokens=max_new_tokens
28 | )
29 |
30 |
31 | def read_data(filename):
32 | res = []
33 | with open(filename, 'r', encoding='utf-8') as f:
34 | lines = f.readlines()
35 | for line in lines:
36 | res.append(json.loads(line.strip()))
37 | return res
38 |
39 |
40 | input_items = read_data(args.test_file)
41 | output_items = []
42 |
43 | def write_data(filename, examples):
44 | with open(filename, 'w', encoding='utf-8') as f:
45 | for example in examples:
46 | f.write(json.dumps(example, ensure_ascii=False) + "\n")
47 |
48 | print("predictions will be written at {}".format(args.predictions_file))
49 |
50 | def get_input_text(input_item):
51 |     # Unused helper kept for reference: concatenate a conversation into a single prompt string.
52 |     conversation = ''
53 |     for sentence in input_item['conversations']:
54 |         sentence_from = sentence["from"].lower()
55 |         sentence_value = 'Human: ' + sentence["value"] + '\n\nAssistant: ' if sentence_from == 'human' else sentence["value"]
56 |         conversation += sentence_value
57 |     return conversation
58 |
59 |
60 |
61 | def _addrole_masklabel_tokenize(source):
62 | '''
63 | add speaker and concatenate the sentences
64 | {
65 | "id": "uniq_sample_id",
66 | "conversations": [
67 | {"from": "human", "value": "你好"},
68 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"},
69 | {"from": "human", "value": "今天天气怎么样?"},
70 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"}
71 | ]
72 | }
73 | tokenizer_bloomz.encode("你好,有什么可以帮助你的吗?") == [41381, 355, 37242, 205599, 7336, 10468]
74 | tokenizer_llama.encode("你好,有什么可以帮助你的吗?") == [1, 29871, 30919, 31076, 30214, 30417, 231, 190, 131, 31882, 30682, 30651, 232, 187, 177, 31931, 30919, 30210, 232, 147, 154, 30882]
75 | '''
76 |
77 | conversation = ''
78 | input_ids = []
79 | for sentence in source[:-1]:
80 | sentence_from = sentence["from"].lower()
81 | sentence_value = 'Human: ' + sentence["value"] + '\n\nAssistant: ' if sentence_from == 'human' else sentence["value"]
82 | conversation += sentence_value
83 | sentence_ids = tokenizer.encode(sentence_value, add_special_tokens=False)#do not add bos_token_id
84 | input_ids += sentence_ids
85 | if sentence_from != 'human':
86 | input_ids += [tokenizer.eos_token_id]#make sure eos_token_id is correct
87 |
88 | return input_ids, conversation
89 |
90 | if __name__ == '__main__':
91 | load_type = torch.float16
92 | if torch.cuda.is_available():
93 | device = torch.device(0)
94 | else:
95 | device = torch.device('cpu')
96 |
97 | if "llama" in args.model_name_or_path:
98 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path)
99 | else:
100 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
101 |
102 | tokenizer.pad_token_id = 0
103 | tokenizer.eos_token_id = 2
104 | model_config = AutoConfig.from_pretrained(args.model_name_or_path)
105 | print("Loading model")
106 | model = AutoModelForCausalLM.from_pretrained(
107 | args.finetuned_model_name_or_path,
108 | torch_dtype=load_type,
109 | config=model_config,
110 | )
111 |
112 | model.to(device)
113 | model.eval()
114 | print("Load model successfully")
115 |
116 | index = 0
117 |
118 | for i in tqdm(range(len(input_items)), total=len(input_items), unit="item"):
119 | batch_input_items = input_items[i]
120 |
121 | input_ids, conversation = _addrole_masklabel_tokenize(source=input_items[i]['conversations'])
122 | input_ids = input_ids[:2048]
123 | if "Human" not in conversation:
124 | continue
125 | attention_mask = [1] * len(input_ids)
126 | input_ids = torch.LongTensor(input_ids).unsqueeze(0)
127 | attention_mask = torch.LongTensor(attention_mask).unsqueeze(0)
128 | #(1, max_seq_len)
129 | generation_output = model.generate(
130 | input_ids = input_ids.to(device),
131 | attention_mask = attention_mask.to(device),
132 | **generation_config
133 | )
134 |
135 | generate_text = tokenizer.decode(generation_output[0].cpu().tolist(),skip_special_tokens=True)
136 | output_items.append({"generate_text": generate_text})
137 | if index%10 == 0:
138 | print("conversation: ", conversation)
139 | print("generate_text: ", generate_text)
140 | print("-"*100)
141 | index += 1
142 |
143 | write_data(args.predictions_file, output_items)
144 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CALM: Credit and Risk Assessment Large Language Model
2 |
3 | - Due to licensing restrictions on [LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) weights, the model cannot be used for commercial purposes. Please adhere strictly to LLaMA's usage policy.
4 | - Considering the limitations of LLaMA's license, we cannot directly distribute the complete model weights. Here, we are only releasing the LoRA weights of [CALM-7B](https://huggingface.co/daishen/CALM-7B).
5 |
6 | ## Content
7 |
8 | - [CALM: Credit and Risk Assessment Large Language Model](#calm-credit-and-risk-assessment-large-language-model)
9 | - [Content](#content)
10 | - [1. Preparing the environment](#1-preparing-the-environment)
11 | - [2. Run](#2-run)
12 | - [2.1 Download data](#21-download-data)
13 | - [2.1.1 Convert data format](#211-convert-data-format)
14 | - [2.2 Model training](#22-model-training)
15 | - [2.2.1 LoRA](#221-lora)
16 |       - [2.2.2 Merge Model with LoRA](#222-merge-model-with-lora)
17 |
18 | ## 1. Preparing the environment
19 |
20 | Create the environment with Conda, then install the required packages with pip.
21 |
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 |
26 | ## 2. Run
27 |
28 | ### 2.1 Download data
29 |
30 | Before running, please download [rawdata](https://huggingface.co/datasets/daishen/CALM-Data/tree/main) to `data/CRA_resample_0.045M.json`
31 |
32 | #### 2.1.1 Convert data format
33 |
34 | ```bash
35 | export raw_data=/path_to/CRA_resample_0.045M.json
36 | export conv_data=/path_to/CRA_resample_0.045M_conv.json
37 | export data_name=CRA
38 | export dev_data=/path_to/CRA-resample-dev3k.json
39 | export train_data=/path_to/CRA-resample-train4w.json
40 |
41 | python scripts/convert_to_conv_data.py \
42 | --orig_data ${raw_data} \
43 | --write_data ${conv_data} \
44 | --dataset_name CRA
45 | head -n 3000 ${conv_data} > ${dev_data}
46 | tail -n +3001 ${conv_data} > ${train_data}
47 | ```
48 |
49 | We designate the first 3000 entries as the validation set, while the remaining data serves as the training set.
50 |
51 | ### 2.2 Model training
52 |
53 | Training strategy
54 |
55 | * LoRA + int8
56 |
57 | The training launch script is `train/scripts/run_sft.sh`. You will need to modify the parameters in `run_sft.sh` according to your specific requirements.
58 |
59 | ```bash
60 | bash scripts/run_sft.sh
61 | ```
62 |
63 | - model_name_or_path: The pretrained model (if it is a [LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model, it needs to be converted to the hf format beforehand so that it can be loaded with from_pretrained)
64 | - train_file: Training data
65 | - validation_file: Validation data
66 | - output_dir: Path for training logs and saved models
67 | - cache_dir: Path used to cache intermediate results of data processing
68 | - cutoff_len: Maximum input sequence length (1024 or above is suggested for [LLaMA](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) models, 512 or above for [Bloom](https://huggingface.co/bigscience/bloom) models)
69 |
70 | #### 2.2.1 LoRA
71 |
72 | ```bash
73 | nohup torchrun --nproc_per_node 2 src/entry_point/sft_train.py \
74 | --model_name_or_path ${model_name_or_path} \
75 | --bf16 True \
76 | --llama True \
77 | --use_lora True \
78 | --deepspeed configs/deepspeed_config_stage3.json \
79 | --lora_config configs/lora_config_llama.json \
80 | --train_file ${train_file} \
81 | --validation_file ${validation_file} \
82 | --per_device_train_batch_size 6 \
83 | --per_device_eval_batch_size 6 \
84 | --gradient_accumulation_steps 1 \
85 | --num_train_epochs 5 \
86 | --model_max_length ${cutoff_len} \
87 | --save_strategy "steps" \
88 | --save_total_limit 3 \
89 | --learning_rate 3e-4 \
90 | --weight_decay 0.00001 \
91 | --warmup_ratio 0.01 \
92 | --lr_scheduler_type "cosine" \
93 | --logging_steps 10 \
94 | --evaluation_strategy "steps" \
95 | --seed 1234 \
96 | --gradient_checkpointing \
97 | --cache_dir ${cache_dir} \
98 | --output_dir ${output_dir} \
99 | --overwrite_output_dir \
100 | > ${log_dir}/train.log 2>&1 &
101 | ```
102 |
103 | **Parameters**
104 |
105 | * use_lora: Train with LoRA
106 | * use_int8_training: Train with 8-bit quantization, which significantly reduces memory usage
107 | * lora_config: Path to the LoRA parameter configuration. If training a [Bloom](https://huggingface.co/bigscience/bloom) model, change it to "`configs/lora_config_bloom.json`"
108 | * deepspeed: When training on long sequences, it is recommended to use deepspeed stage 3, which shards the model parameters across multiple GPUs and leaves room to load even longer sequences
109 |
110 | **Note:** Please be aware that you can only choose between "`use_int8_training`" and "`deepspeed`"; they cannot be used simultaneously.
111 |
112 | The structure of the **output_dir**:
113 |
114 | ```
115 | output_dir/
116 | ├── checkpoint-244/
117 | │ ├── pytorch_model.bin
118 | │ └── trainer_state.json
119 | ├── checkpoint-527/
120 | │ ├── pytorch_model.bin
121 | │ └── trainer_state.json
122 | ├── adapter_model.bin
123 | ├── print_log.txt
124 | └── adapter_config.json
125 | ```
126 |
127 | The highest-level directory stores the final model obtained from the training process.
128 |
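If you want to use the adapter in `output_dir` without merging it first, it can be loaded on top of the base model like this (a minimal sketch; the paths are placeholders for your own):

```python
# Minimal sketch: load the trained LoRA adapter from output_dir on top of the base model.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaTokenizer

model_name_or_path = "model_path_to/llama-2-7b-chat-T/"  # placeholder, same base model as in training
output_dir = "path_to/output_dir"                        # placeholder, the training output_dir above

tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)
base_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, output_dir, torch_dtype=torch.float16)
model.eval()
```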
129 | #### 2.2.2 Merge Model with LoRA
130 |
131 | If you wish to merge the weights of LoRA with a pre-trained model, you can execute the following command:
132 |
133 | ```bash
134 | model_name_or_path=model_path_to/llama-2-7b-chat-T/
135 | lora_path=lora_path_to/checkpoint_2/3739
136 | output_path=out_path_to/CRA__model_2/model_3739
137 |
138 | CUDA_VISIBLE_DEVICES=0 python src/merge_llama_with_lora.py \
139 | --model_name_or_path ${model_name_or_path} \
140 | --output_path ${output_path} \
141 | --lora_path ${lora_path} \
142 | --llama
143 | ```
144 |
145 | The merged weights will be saved in the "`output_path`" directory. You can subsequently load them directly using "`from_pretrained`".
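For example, the merged checkpoint can then be loaded like any other Hugging Face model (a minimal sketch; if the tokenizer was not saved alongside the merged weights, load it from the base model path instead):

```python
# Minimal sketch: load the merged weights from output_path with from_pretrained.
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

output_path = "out_path_to/CRA__model_2/model_3739"  # same output_path as above
tokenizer = LlamaTokenizer.from_pretrained(output_path)
model = AutoModelForCausalLM.from_pretrained(output_path, torch_dtype=torch.float16)
model.eval()
```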
146 |
--------------------------------------------------------------------------------
/train/debug/app.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple, Union
2 | from peft import PeftModel
3 | import torch
4 | from transformers import LlamaTokenizer, AutoModelForCausalLM, GenerationConfig
5 | from flask import Flask, render_template, request
6 | import sys
7 | sys.path.append("../src")
8 |
9 |
10 | def load_model():
11 | model_name_or_path = '/nfs/a100-80G-17/jiyunjie/finetuned_ckpt/on_belle_tokenizer50k_openinstr_zh/zh_alpaca_gpt3.5_gpt4_sharegpt_epoch=2-step=20652'
12 | ckpt_path = '/nfs/a100-006/hanweiguang/saved_model/boxue_1_bs-8_lr-3e-4_wm-1e-2_epoch-10_lora'
13 |
14 | print('Loading model...')
15 | # Initialize the model and tokenizer
16 | load_type = torch.float16
17 | tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)
18 | tokenizer.pad_token_id = 0
19 | tokenizer.bos_token_id = 1
20 | tokenizer.eos_token_id = 2
21 | tokenizer.padding_side = "left"
22 | base_model = AutoModelForCausalLM.from_pretrained(
23 | model_name_or_path, torch_dtype=load_type
24 | )
25 | model = PeftModel.from_pretrained(
26 | base_model, ckpt_path, torch_dtype=load_type
27 | )
28 | model.eval()
29 | model.to('cuda:0')
30 | print('Model loaded!')
31 | return model, tokenizer
32 |
33 | # Initialize flask
34 | app = Flask(__name__)
35 |
36 | model, tokenizer = None, None
37 |
38 |
39 | @app.before_first_request
40 | def initialize_model():
41 | global model, tokenizer
42 | model, tokenizer = load_model()
43 |
44 |
45 | def get_probability_per_token(input_ids: torch.Tensor) -> Tuple[List[float], float]:
46 | """
47 | input_ids: [1, sen_len]
48 | """
49 | with torch.no_grad():
50 | outputs = model(input_ids=input_ids, labels=input_ids)
51 | logits = outputs.logits
52 | probs = logits.softmax(dim=-1)
53 | # Select the probabilities of the generated tokens
54 | # [1, sen_len, 1]
55 | generated_token_probs = torch.gather(
56 | probs[:, :-1, :], 2, input_ids[:, 1:, None]) # skip the first token_id
57 | return generated_token_probs[0, ..., 0].tolist(), outputs.loss.item()
58 |
59 |
60 | def decode_and_get_probability(input_text: str, generation_config: GenerationConfig):
61 | input_ids = tokenizer.encode(input_text, return_tensors='pt').to(
62 | getattr(model, 'module', model).device)
63 | with torch.no_grad():
64 | output = model.generate(input_ids=input_ids,
65 | generation_config=generation_config)
66 | token_ids = output[0].tolist()
67 | tokens = [tokenizer.decode([token_id], skip_special_tokens=False)
68 | for token_id in token_ids]
69 | token_probs_list, loss = get_probability_per_token(output)
70 |
71 |     # The probability of the first token (the bos token) is set to 0
72 | tokens_with_probs = [{'token': tokens[0], 'prob': 0.0}]
73 | for token, prob in zip(tokens[1:], token_probs_list):
74 | tokens_with_probs.append({
75 | 'token': token,
76 | 'prob': prob
77 | })
78 |
79 | return tokens_with_probs, loss
80 |
81 |
82 | def get_probability_of_text(input_text: str) -> List[Dict[str, Union[float, int]]]:
83 | input_ids = tokenizer.encode(
84 | input_text, return_tensors='pt').to(
85 | getattr(model, 'module', model).device
86 | )
87 | token_ids = input_ids[0].tolist()
88 | tokens = [tokenizer.decode([token_id], skip_special_tokens=False)
89 | for token_id in token_ids]
90 | token_probs_list, loss = get_probability_per_token(input_ids)
91 |
92 |     # The probability of the first token (the bos token) is set to 0
93 | tokens_with_probs = [{'token': tokens[0], 'prob': 0.0}]
94 | for token, prob in zip(tokens[1:], token_probs_list):
95 | tokens_with_probs.append({
96 | 'token': token,
97 | 'prob': prob
98 | })
99 |
100 | return tokens_with_probs, loss
101 |
102 |
103 | @app.route("/inference", methods=['POST'])
104 | def inference():
105 | input_text = request.form.get('input_text')
106 | if input_text is None:
107 |         # handle the case where no input text was provided
108 | return render_template('error.html', message='Input text is missing')
109 | tokens_with_probs, loss = get_probability_of_text(input_text)
110 | return render_template('output.html', tokens_with_probs=tokens_with_probs, loss=loss)
111 |
112 |
113 | @app.route('/generate', methods=['POST'])
114 | def generate():
115 | input_text = request.form.get('input_text')
116 | if input_text is None:
117 |         # handle the case where no input text was provided
118 | return render_template('error.html', message='Input text is missing')
119 |
120 | # Fetch the parameters from the form
121 | temperature = float(request.form.get('temperature'))
122 | top_p = float(request.form.get('top_p'))
123 | top_k = int(request.form.get('top_k'))
124 | num_beams = int(request.form.get('num_beams'))
125 | max_new_tokens = int(request.form.get('max_new_tokens'))
126 | min_new_tokens = int(request.form.get('min_new_tokens'))
127 | repetition_penalty = float(request.form.get('repetition_penalty'))
128 | do_sample = bool(request.form.get('do_sample'))
129 |
130 | generation_config = GenerationConfig(
131 | temperature=temperature,
132 | top_p=top_p,
133 | top_k=top_k,
134 | num_beams=num_beams,
135 | bos_token_id=1,
136 | eos_token_id=2,
137 | pad_token_id=0,
138 | max_new_tokens=max_new_tokens, # max_length=max_new_tokens+input_sequence
139 | min_new_tokens=min_new_tokens, # min_length=min_new_tokens+input_sequence
140 | repetition_penalty=repetition_penalty,
141 | do_sample=do_sample,
142 | )
143 |
144 | tokens_with_probs, loss = decode_and_get_probability(
145 | input_text, generation_config)
146 | return render_template('output.html', tokens_with_probs=tokens_with_probs, loss=loss)
147 |
148 |
149 | @app.route('/')
150 | def home():
151 | return render_template('index.html')
152 |
153 |
154 | if __name__ == "__main__":
155 | app.run(debug=True, port=5000)
156 |
--------------------------------------------------------------------------------
/train/dschat_train_v1/utils/module/lora.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 | import math
6 | import torch
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from deepspeed.compression.helper import recursive_getattr, recursive_setattr
10 | import deepspeed
11 |
12 |
13 | class LinearLayer_LoRA(nn.Module):
14 |     # a simple implementation of LoRA
15 | # for now only support Linear Layer
16 | def __init__(self,
17 | weight,
18 | lora_dim=0,
19 | lora_alpha=16,
20 | lora_droppout=0.05,
21 | bias=None):
22 | super(LinearLayer_LoRA, self).__init__()
23 | self.weight = weight
24 | self.bias = bias
25 |
26 | if lora_dim <= 0:
27 | raise ValueError(
28 |                 "You are trying to use LoRA, whose reduced dim should be larger than 1"
29 | )
30 |
31 | try:
32 | # for zero stage 3
33 | rows, columns = weight.ds_shape
34 | except:
35 | rows, columns = weight.shape
36 | self.lora_right_weight = nn.Parameter(torch.zeros(
37 | columns,
38 |             lora_dim))  # stored transposed so the forward pass does not need to transpose again
39 | self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows))
40 | self.lora_scaling = lora_alpha / lora_dim #alpha/rank
41 |
42 | if lora_droppout > 0:
43 | self.lora_dropout = nn.Dropout(lora_droppout)
44 | else:
45 | self.lora_dropout = nn.Identity()
46 |
47 | self.reset_parameters()
48 | # disable the original weight gradient
49 | self.weight.requires_grad = False
50 | # fuse LoRA to the original weight
51 | self.fuse_lora = False
52 |
53 | def eval(self):
54 | self.lora_dropout.eval()
55 |
56 | # self.fuse_lora_weight()
57 |
58 | def train(self, mode=True):
59 | self.lora_dropout.train(mode)
60 | # self.unfuse_lora_weight()
61 |
62 | def reset_parameters(self):
63 | nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5))
64 | nn.init.zeros_(self.lora_left_weight)
65 |
66 | def fuse_lora_weight(self):
67 | if not self.fuse_lora:
68 | self.weight.data += self.lora_scaling * torch.matmul(
69 | self.lora_left_weight.t(), self.lora_right_weight.t())
70 | self.fuse_lora = True
71 |
72 | def unfuse_lora_weight(self):
73 | if self.fuse_lora:
74 | self.weight.data -= self.lora_scaling * torch.matmul(
75 | self.lora_left_weight.t(), self.lora_right_weight.t())
76 | self.fuse_lora = False
77 |
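# The forward pass below computes y = x @ W.T + b + lora_scaling * ((dropout(x) @ lora_right_weight) @ lora_left_weight),
# where lora_right_weight has shape (in_features, lora_dim) and lora_left_weight has shape (lora_dim, out_features).
# fuse_lora_weight() folds the equivalent update lora_scaling * (lora_left_weight.t() @ lora_right_weight.t()) into W,
# so fused inference needs only a single F.linear call.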
78 | def forward(self, input):
79 | if self.fuse_lora:
80 | return F.linear(input, self.weight, self.bias)
81 | else:
82 | return F.linear(
83 | input, self.weight,
84 | self.bias) + (self.lora_dropout(input) @ self.lora_right_weight
85 | @ self.lora_left_weight) * self.lora_scaling
86 |
87 |
88 | '''
89 | LLaMA model named_modules:
90 | model.layers.29.mlp
91 | model.layers.29.mlp.gate_proj
92 | model.layers.29.mlp.down_proj
93 | model.layers.29.mlp.up_proj
94 | model.layers.29.mlp.act_fn
95 | model.layers.29.input_layernorm
96 | model.layers.29.post_attention_layernorm
97 | model.layers.30
98 | model.layers.30.self_attn
99 | model.layers.30.self_attn.q_proj
100 | model.layers.30.self_attn.k_proj
101 | model.layers.30.self_attn.v_proj
102 | model.layers.30.self_attn.o_proj
103 | model.layers.30.self_attn.rotary_emb
104 | '''
105 | # convert the linear layer to LoRA
106 | def convert_linear_layer_to_lora(model,
107 | lora_module_name,
108 | lora_dim=0,
109 | lora_alpha=1,
110 | lora_droppout=0):
111 | def set_params(param_modules, name):
112 | for each_ in param_modules:
113 | if each_ in name:
114 | return True
115 | return False
116 |
117 | replace_name = []
118 | for name, module in model.named_modules():
119 | if isinstance(module, nn.Linear) and set_params(lora_module_name, name):
120 | replace_name.append(name)
121 |
122 | print("replace_name : ", replace_name)
123 | for name in replace_name:
124 | module = recursive_getattr(model, name)
125 | tmp = LinearLayer_LoRA(
126 | module.weight, lora_dim, lora_alpha, lora_droppout,
127 | module.bias).to(module.weight.device).to(module.weight.dtype)
128 | recursive_setattr(model, name, tmp)
129 | return model
130 |
131 |
132 |
133 | def _z3_params_to_fetch(param_list):
134 | return [
135 | p for p in param_list
136 | if hasattr(p, 'ds_id') and p.ds_status == deepspeed.runtime.zero.
137 | partition_parameters.ZeroParamStatus.NOT_AVAILABLE
138 | ]
139 |
140 |
141 | # convert the LoRA layer to linear layer
142 | def convert_lora_to_linear_layer(model):
143 | replace_name = []
144 | for name, module in model.named_modules():
145 | if isinstance(module, LinearLayer_LoRA):
146 | replace_name.append(name)
147 | for name in replace_name:
148 | module = recursive_getattr(model, name)
149 | zero_stage_3 = hasattr(module.weight, 'ds_id')
150 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([
151 | module.weight, module.bias, module.lora_left_weight,
152 | module.lora_right_weight
153 | ]),
154 | modifier_rank=0,
155 | enabled=zero_stage_3):
156 | module.fuse_lora_weight()
157 | return model
158 |
159 |
160 | def only_optimize_lora_parameters(model):
161 | # turn off the gradient of all the parameters except the LoRA parameters
162 | for name, param in model.named_parameters():
163 | if "lora_right_weight" in name or "lora_left_weight" in name:
164 | param.requires_grad = True
165 | else:
166 | param.requires_grad = False
167 | return model
168 |
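# Minimal usage sketch (assumed wiring; `model` and the module-name list are placeholders):
#     model = convert_linear_layer_to_lora(
#         model, ["q_proj", "k_proj", "v_proj", "o_proj"],
#         lora_dim=16, lora_alpha=16, lora_droppout=0.05)
#     model = only_optimize_lora_parameters(model)
#     # ... train ...
#     model = convert_lora_to_linear_layer(model)  # fuse the LoRA update into the base weights before saving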
--------------------------------------------------------------------------------
/train/dschat_train_v1/utils/data/raw_datasets.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # DeepSpeed Team
5 | from datasets import disable_caching
6 | disable_caching()
7 | from datasets import load_dataset
8 | from torch.utils.data import Subset
9 | import re
10 | import os
11 |
12 |
13 | # The template prompt dataset class that all new dataset porting needs to
14 | # follow in order to have a unified API and unified data format.
15 | class PromptRawDataset(object):
16 |
17 | def __init__(self, output_path, seed, local_rank):
18 | self.output_path = output_path
19 | self.seed = seed
20 | self.local_rank = local_rank
21 |
22 | def get_train_data(self):
23 | return
24 |
25 | def get_eval_data(self):
26 | return
27 |
28 | # The prompt should be in the format of: " Human: " + actual_prompt_sentence + " Assistant:"
29 | def get_prompt(self, sample):
30 | return
31 |
32 | # The chosen response should be in the format of: " " + actual_response_sentence
33 | def get_chosen(self, sample):
34 | return
35 |
36 | # The rejected response should be in the format of: " " + actual_response_sentence
37 | # If the dataset does not have rejected response, return None
38 | def get_rejected(self, sample):
39 | return
40 |
41 | def get_prompt_and_chosen(self, sample):
42 | return
43 |
44 | def get_prompt_and_rejected(self, sample):
45 | return
46 |
47 |
48 | # English dataset. https://huggingface.co/datasets/Dahoas/rm-static
49 | class DahoasRmstaticDataset(PromptRawDataset):
50 |
51 | def __init__(self, output_path, seed, local_rank):
52 | super().__init__(output_path, seed, local_rank)
53 | self.dataset_name = "Dahoas/rm-static"
54 | self.dataset_name_clean = "Dahoas_rm_static"
55 | self.raw_datasets = load_dataset("Dahoas/rm-static")
56 |
57 | def get_train_data(self):
58 | return self.raw_datasets["train"]
59 |
60 | def get_eval_data(self):
61 | return self.raw_datasets["test"]
62 |
63 | def get_prompt(self, sample):
64 | return sample['prompt']
65 |
66 | def get_chosen(self, sample):
67 | return sample['chosen']
68 |
69 | def get_rejected(self, sample):
70 | return sample['rejected']
71 |
72 | def get_prompt_and_chosen(self, sample):
73 | return sample['prompt'] + sample['chosen']
74 |
75 | def get_prompt_and_rejected(self, sample):
76 | return sample['prompt'] + sample['rejected']
77 |
78 |
79 | # # Belleschool_math0.25K
80 | # class BelleOpenSoucreDataset(PromptRawDataset):
81 |
82 | # def __init__(self, output_path, seed, local_rank, data_file):
83 | # eval_data_file = "utils/data/dev1K.json"
84 | # super().__init__(output_path, seed, local_rank)
85 | # self.dataset_name = "BelleOpenSoucre"
86 | # self.dataset_name_clean = "BelleOpenSoucre"
87 | # dataset_cache_dir = "output/data_files"
88 | # print("data_file = ", data_file)
89 | # self.raw_datasets = load_dataset("json", data_files=data_file, cache_dir=dataset_cache_dir)
90 | # self.raw_datasets.cleanup_cache_files()
91 | # self.dev_raw_datasets = load_dataset("json", data_files=eval_data_file, cache_dir=dataset_cache_dir)
92 | # self.dev_raw_datasets.cleanup_cache_files()
93 | # print(self.raw_datasets["train"])
94 |
95 | # def get_train_data(self):
96 | # return self.raw_datasets["train"]
97 |
98 | # def get_eval_data(self):
99 | # return self.dev_raw_datasets["train"]
100 |
101 | # def get_prompt(self, sample):
102 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: "
103 |
104 | # def get_chosen(self, sample):
105 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: "
106 |
107 |
108 | # def get_prompt_and_chosen(self, sample):
109 | # return "Human: "+sample['instruction']+sample['input']+"\n Assistant: "+sample['output']
110 |
111 |
112 |
113 | class BelleOpenSoucreDataset(PromptRawDataset):
114 |
115 | def __init__(self, output_path, seed, local_rank, data_file, eval_data_file=None):
116 | '''
117 | {
118 | "id": "uniq_sample_id",
119 | "conversations": [
120 | {"from": "human", "value": "你好"},
121 | {"from": "assistant", "value": "你好,有什么可以帮助你的吗?"},
122 | {"from": "human", "value": "今天天气怎么样?"},
123 | {"from": "assistant", "value": "不好意思,我无法回答你的问题,因为我不知道你的位置信息,同时我目前还无法获取到最新的天气信息。"}
124 | ]
125 | }
126 | LlamaTokenizer automatically prepends bos_token_id, while BloomTokenizer does not.
127 | Both tokenizers share the same bos_token_id and eos_token_id; pad_token_id is forced to 0.
128 | '''
129 | # eval_data_file = "utils/data/dev1K.json"
130 |
131 | super().__init__(output_path, seed, local_rank)
132 | self.dataset_name = "BelleOpenSoucre"
133 | self.dataset_name_clean = "BelleOpenSoucre"
134 | dataset_cache_dir = "output/data_files"
135 | print("data_file = ", data_file)
136 | self.raw_datasets = load_dataset("json", data_files=data_file, cache_dir=dataset_cache_dir)
137 | self.raw_datasets.cleanup_cache_files()
138 |
139 | if eval_data_file is not None and os.path.exists(eval_data_file):
140 | print("eval_data_file = ", eval_data_file)
141 | self.dev_raw_datasets = load_dataset("json", data_files=eval_data_file, cache_dir=dataset_cache_dir)
142 | self.dev_raw_datasets.cleanup_cache_files()
143 | self.train_data = self.raw_datasets["train"]
144 | self.eval_data = self.dev_raw_datasets["train"]
145 | else:
146 | train_val = self.raw_datasets["train"].train_test_split(
147 | test_size=1000, shuffle=True, seed=42
148 | )
149 | self.train_data = train_val["train"]
150 | self.eval_data = train_val["test"]
151 |
152 | print("train_data: ", self.train_data)
153 | print("eval_data: ", self.eval_data)
154 |
155 |
156 | def get_train_data(self):
157 | return self.train_data
158 |
159 | def get_eval_data(self):
160 | return self.eval_data
161 |
162 | def get_conversations(self, sample):
163 | return sample['conversations']
164 |
--------------------------------------------------------------------------------
/train/src/entry_point/interface.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from functools import partial
3 | import os
4 | import deepspeed
5 | import gradio as gr
6 | import torch
7 | from peft import PeftModel
8 | from transformers import (
9 | AutoConfig,
10 | AutoModelForCausalLM,
11 | AutoTokenizer,
12 | GenerationConfig,
13 | LlamaTokenizer,
14 | )
15 | import sys
16 | import traceback
17 | # import pudb
18 |
19 | # Drop into the debugger when an exception is raised
20 | # def debug_on_exception(exctype, value, tb):
21 | # traceback.print_exception(exctype, value, tb)
22 | # pudb.post_mortem(tb)
23 |
24 |
25 | # sys.excepthook = debug_on_exception
26 |
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument("--local_rank", type=int, default=0)
29 | parser.add_argument("--model_name_or_path", type=str, required=True)
30 | parser.add_argument("--ckpt_path", type=str, required=True)
31 | parser.add_argument("--use_lora", action="store_true")
32 | parser.add_argument("--use_deepspeed", action="store_true")
33 | parser.add_argument("--llama", action="store_true")
34 | parser.add_argument("--base_port", default=17860, type=int)
35 | args = parser.parse_args()
36 |
37 |
38 | def generate_prompt(input_text):
39 | return input_text
40 |
41 |
42 | def evaluate(
43 | model,
44 | tokenizer,
45 | input: str,
46 | temperature=0.1,  # sampling temperature: higher values make the output more random, lower values more deterministic
47 | top_p=0.75,  # nucleus sampling: keep only the most likely tokens whose cumulative probability reaches top_p
48 | top_k=40,  # consider only the top_k most likely tokens at each generation step
49 | num_beams=4,  # number of beams for beam search, which tends to produce more fluent and consistent text
50 | do_sample=False,  # whether to sample; True uses random sampling, False uses greedy/beam decoding
51 | max_new_tokens=128,  # maximum number of newly generated tokens, controlling the output length
52 | min_new_tokens=1,  # minimum number of newly generated tokens, preventing overly short outputs
53 | repetition_penalty=1.2,  # penalty applied to repeated tokens in the generated text
54 | **kwargs,
55 | ):
56 | prompt = generate_prompt(input)
57 | inputs = tokenizer(prompt, return_tensors="pt")
58 |
59 | input_ids = inputs["input_ids"].to(getattr(model, 'module', model).device)
60 |
61 | generation_config = GenerationConfig(
62 | temperature=temperature,
63 | top_p=top_p,
64 | top_k=top_k,
65 | num_beams=num_beams,
66 | bos_token_id=tokenizer.bos_token_id,
67 | eos_token_id=tokenizer.eos_token_id,
68 | pad_token_id=tokenizer.pad_token_id,
69 | max_new_tokens=max_new_tokens, # max_length=max_new_tokens+input_sequence
70 | min_new_tokens=min_new_tokens, # min_length=min_new_tokens+input_sequence
71 | repetition_penalty=repetition_penalty,
72 | do_sample=do_sample,
73 | **kwargs,
74 | )
75 | with torch.no_grad():
76 | # pudb.set_trace()
77 | generation_output = model.generate(
78 | input_ids=input_ids,
79 | generation_config=generation_config,
80 | return_dict_in_generate=True,
81 | output_scores=False
82 | )
83 | output = generation_output.sequences[0]
84 | output = (
85 | tokenizer.decode(output, skip_special_tokens=True)
86 | .strip()
87 | )[len(input):]
88 | return output
89 |
90 |
91 | if __name__ == "__main__":
92 | load_type = torch.float16 # Sometimes may need torch.float32
93 |
94 | if args.llama:
95 | tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path)
96 | else:
97 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
98 |
99 | tokenizer.pad_token_id = 0  # set the tokenizer's padding token id to 0
100 | tokenizer.bos_token_id = 1  # set the beginning-of-sequence token id to 1
101 | tokenizer.eos_token_id = 2  # set the end-of-sequence token id to 2
102 | tokenizer.padding_side = "left"  # pad sequences on the left
103 | model_config = AutoConfig.from_pretrained(args.model_name_or_path)
104 |
105 | print(f"Rank {args.local_rank} loading model...") # "rank" 在计算机科学领域通常用于表示并行计算中的处理单元的标识或位置
106 |
107 | if args.use_lora:
108 | base_model = AutoModelForCausalLM.from_pretrained(
109 | args.model_name_or_path, torch_dtype=load_type, config=model_config
110 | )
111 | model = PeftModel.from_pretrained(
112 | base_model, args.ckpt_path, torch_dtype=load_type
113 | )
114 | else:
115 | model = AutoModelForCausalLM.from_pretrained(
116 | args.ckpt_path, torch_dtype=load_type, config=model_config
117 | )
118 |
119 | if not args.use_deepspeed:
120 | if torch.cuda.is_available():
121 | device = torch.device(f'cuda')
122 | else:
123 | device = torch.device('cpu')
124 | if device == torch.device('cpu'):
125 | model.float()
126 | print(f'device: {device}')
127 | model.to(device)
128 | model.eval()
129 | else:
130 | model = deepspeed.init_inference(
131 | model,
132 | mp_size=int(os.getenv("WORLD_SIZE", "1")),
133 | dtype=torch.half,
134 | checkpoint=None,
135 | replace_with_kernel_inject=True,
136 | )
137 |
138 | # model = None
139 |
140 | print("Load model successfully")
141 | # https://gradio.app/docs/  # Gradio is a Python library for building interactive interfaces for ML models
142 | gr.Interface(
143 | fn=partial(evaluate, model, tokenizer),
144 | inputs=[
145 | gr.components.Textbox(
146 | lines=2, label="Input", placeholder="Welcome to the BELLE model"
147 | ),
148 | gr.components.Slider(minimum=0, maximum=1,
149 | value=0.1, label="Temperature"),
150 | gr.components.Slider(minimum=0, maximum=1,
151 | value=0.75, label="Top p"),
152 | gr.components.Slider(
153 | minimum=0, maximum=100, step=1, value=40, label="Top k"
154 | ),
155 | gr.components.Slider(
156 | minimum=1, maximum=4, step=1, value=1, label="Beams Number"
157 | ),
158 | gr.components.Checkbox(
159 | value=False,
160 | label="Do sample"
161 | ),
162 | gr.components.Slider(
163 | minimum=1, maximum=2000, step=10, value=512, label="Max New Tokens"
164 | ),
165 | gr.components.Slider(
166 | minimum=1, maximum=300, step=10, value=1, label="Min New Tokens"
167 | ),
168 | gr.components.Slider(
169 | minimum=1.0,
170 | maximum=2.0,
171 | step=0.1,
172 | value=1.2,
173 | label="Repetition Penalty",
174 | )
175 | ],
176 | outputs=[
177 | gr.components.Textbox(
178 | lines=25,
179 | label="Output",
180 | )
181 | ],
182 | title="BELLE: Be Everyone's Large Language model Engine",
183 | ).queue().launch(
184 | share=True, server_name="0.0.0.0", server_port=args.base_port + args.local_rank
185 | )
186 |
--------------------------------------------------------------------------------
/train/src/apply_delta.py:
--------------------------------------------------------------------------------
1 | #!python
2 | # -*- coding: utf-8 -*-
3 | # @author: Kun
4 |
5 | import argparse
6 | import gc
7 | import glob
8 | import json
9 | import os
10 | import shutil
11 | import tempfile
12 |
13 | from huggingface_hub import snapshot_download
14 | import torch
15 | from torch import nn
16 | from tqdm import tqdm
17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
18 | from transformers import LlamaTokenizer, LlamaConfig, LlamaForCausalLM
19 |
20 | """
21 | Apply the delta weights on top of a base model.
22 |
23 | Usage:
24 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1
25 | """
26 | PROXIES={
27 | 'http': os.environ.get('http_proxy', ''),
28 | 'https': os.environ.get('https_proxy', '')
29 | }
30 | GB = 1 << 30
31 |
32 | def split_files(model_path, tmp_path, split_size):
33 | if not os.path.exists(model_path):
34 | model_path = snapshot_download(repo_id=model_path)
35 | if not os.path.exists(tmp_path):
36 | os.makedirs(tmp_path)
37 |
38 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin")
39 | files = glob.glob(file_pattern)
40 |
41 | part = 0
42 | for file_path in tqdm(files):
43 | state_dict = torch.load(file_path)
44 | new_state_dict = {}
45 |
46 | current_size = 0
47 | try:
48 | for name, param in state_dict.items():
49 | param_size = param.numel() * param.element_size()
50 |
51 | if current_size + param_size > split_size:
52 | new_file_name = f"pytorch_model-{part}.bin"
53 | new_file_path = os.path.join(tmp_path, new_file_name)
54 | torch.save(new_state_dict, new_file_path)
55 | current_size = 0
56 | new_state_dict = None
57 | gc.collect()
58 | new_state_dict = {}
59 | part += 1
60 |
61 | new_state_dict[name] = param
62 | current_size += param_size
63 |
64 | new_file_name = f"pytorch_model-{part}.bin"
65 | new_file_path = os.path.join(tmp_path, new_file_name)
66 | torch.save(new_state_dict, new_file_path)
67 | new_state_dict = None
68 | gc.collect()
69 | new_state_dict = {}
70 | part += 1
71 | except Exception as e:
72 | print(f"An error occurred during split_files: {e}")
73 | shutil.rmtree(tmp_path)
74 | raise
75 |
76 |
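# Low-memory strategy: re-split both the base and the delta checkpoints into shards of at
# most `split_size` bytes on disk (via split_files above), then add the delta to the base
# weights one shard at a time so that peak CPU RAM stays well below the full model size.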
77 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
78 | # base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)
79 | # base_config = AutoConfig.from_pretrained(base_model_path)
80 |
81 | base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False, proxies=PROXIES)
82 | base_config = LlamaConfig.from_pretrained(base_model_path, proxies=PROXIES)
83 |
84 | if os.path.exists(target_model_path):
85 | shutil.rmtree(target_model_path)
86 | os.makedirs(target_model_path)
87 |
88 | split_size = 4 * GB
89 |
90 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
91 | print(f"Split files for the base model to {tmp_base_path}")
92 | split_files(base_model_path, tmp_base_path, split_size)
93 | print(f"Split files for the delta model to {tmp_delta_path}")
94 | split_files(delta_path, tmp_delta_path, split_size)
95 |
96 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
97 | base_files = glob.glob(base_pattern)
98 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
99 | delta_files = glob.glob(delta_pattern)
100 | delta_state_dict = torch.load(delta_files[0])
101 |
102 | print("Applying the delta")
103 | weight_map = {}
104 | total_size = 0
105 |
106 | for i, base_file in tqdm(enumerate(base_files)):
107 | state_dict = torch.load(base_file)
108 | file_name = f"pytorch_model-{i}.bin"
109 | for name, param in state_dict.items():
110 | if name not in delta_state_dict:
111 | for delta_file in delta_files:
112 | delta_state_dict = torch.load(delta_file)
113 | gc.collect()
114 | if name in delta_state_dict:
115 | break
116 |
117 | state_dict[name] += delta_state_dict[name]
118 | weight_map[name] = file_name
119 | total_size += param.numel() * param.element_size()
120 | gc.collect()
121 | torch.save(state_dict, os.path.join(target_model_path, file_name))
122 |
123 | with open(
124 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
125 | ) as f:
126 | json.dump(
127 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
128 | )
129 |
130 | print(f"Saving the target model to {target_model_path}")
131 | base_tokenizer.save_pretrained(target_model_path)
132 | base_config.save_pretrained(target_model_path)
133 |
134 |
135 | def apply_delta(base_model_path, target_model_path, delta_path):
136 | print(f"Loading the base model from {base_model_path}")
137 | # base = AutoModelForCausalLM.from_pretrained(
138 | # base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
139 | # )
140 | # base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)
141 | base = LlamaForCausalLM.from_pretrained(
142 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, proxies=PROXIES
143 | )
144 | base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False, proxies=PROXIES)
145 |
146 | print(f"Loading the delta from {delta_path}")
147 | delta = AutoModelForCausalLM.from_pretrained(
148 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, proxies=PROXIES
149 | )
150 |
151 | print("Applying the delta")
152 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
153 | assert name in delta.state_dict()
154 | param.data += delta.state_dict()[name]
155 |
156 | print(f"Saving the target model to {target_model_path}")
157 | base.save_pretrained(target_model_path)
158 | base_tokenizer.save_pretrained(target_model_path)
159 |
160 |
161 | if __name__ == "__main__":
162 | parser = argparse.ArgumentParser()
163 | parser.add_argument("--base-model-path", type=str, required=True)
164 | parser.add_argument("--target-model-path", type=str, required=True)
165 | parser.add_argument("--delta-path", type=str, required=True)
166 | parser.add_argument(
167 | "--low-cpu-mem",
168 | action="store_true",
169 | help="Lower the cpu memory usage. This will split large files and use "
170 | "disk as swap to reduce the memory usage below 10GB.",
171 | )
172 | args = parser.parse_args()
173 |
174 | if args.low_cpu_mem:
175 | apply_delta_low_cpu_mem(
176 | args.base_model_path, args.target_model_path, args.delta_path
177 | )
178 | else:
179 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
180 |
--------------------------------------------------------------------------------
/train/src/trainer.py:
--------------------------------------------------------------------------------
1 | from peft import PeftModel
2 | from transformers.trainer import *
3 |
4 | from src.utils import get_ds_state_dict
5 |
6 |
7 | class MyTrainer(Trainer):
8 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
9 | """
10 | Add supports for peft + deepspeed zero 3
11 |
12 | Will save the model, so you can reload it using `from_pretrained()`.
13 |
14 | Will only save from the main process.
15 | """
16 |
17 | if output_dir is None:
18 | output_dir = self.args.output_dir
19 |
20 | if is_torch_tpu_available():
21 | self._save_tpu(output_dir)
22 | elif is_sagemaker_mp_enabled():
23 | # Calling the state_dict needs to be done on the wrapped model and on all processes.
24 | os.makedirs(output_dir, exist_ok=True)
25 | state_dict = self.model_wrapped.state_dict()
26 | if self.args.should_save:
27 | self._save(output_dir, state_dict=state_dict)
28 | if IS_SAGEMAKER_MP_POST_1_10:
29 | # 'user_content.pt' indicates model state_dict saved with smp >= 1.10
30 | Path(os.path.join(output_dir, "user_content.pt")).touch()
31 | elif (
32 | ShardedDDPOption.ZERO_DP_2 in self.args.sharded_ddp
33 | or ShardedDDPOption.ZERO_DP_3 in self.args.sharded_ddp
34 | or self.fsdp is not None
35 | or self.is_fsdp_enabled
36 | ):
37 | if self.is_fsdp_enabled:
38 | os.makedirs(output_dir, exist_ok=True)
39 | self.accelerator.state.fsdp_plugin.save_model(self.accelerator, self.model, output_dir)
40 | else:
41 | state_dict = self.model.state_dict()
42 |
43 | if self.args.should_save:
44 | self._save(output_dir, state_dict=state_dict)
45 | elif self.is_deepspeed_enabled:
46 | # This must be called on all ranks in stage 3
47 | if is_deepspeed_zero3_enabled():
48 | state_dict = get_ds_state_dict(self.deepspeed)
49 | else:
50 | # Only run on rank 0 except stage 3
51 | if self.args.should_save:
52 | state_dict = get_ds_state_dict(self.deepspeed)
53 | # this takes care of everything as long as we aren't under zero3
54 | # Only run on rank 0
55 | if self.args.should_save:
56 | # state_dict is available on rank 0
57 | self._save(output_dir, state_dict=state_dict)
58 |
59 | elif self.args.should_save:
60 | self._save(output_dir)
61 |
62 | # Push to the Hub when `save_model` is called by the user.
63 | if self.args.push_to_hub and not _internal_call:
64 | self.push_to_hub(commit_message="Model save")
65 |
66 | def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
67 | """
68 | Add supports for peft resume
69 | """
70 | if model is None:
71 | model = self.model
72 |
73 | config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME)
74 |
75 | weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME)
76 | weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
77 | adapter_model_path = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME)
78 | safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME)
79 | safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME)
80 | safe_adapter_model_path = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)
81 |
82 | if not any(
83 | os.path.isfile(f) for f in [weights_file, safe_weights_file, adapter_model_path, weights_index_file, safe_weights_index_file, safe_adapter_model_path]
84 | ):
85 | raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
86 |
87 | logger.info(f"Loading model from {resume_from_checkpoint}.")
88 |
89 | if os.path.isfile(config_file):
90 | config = PretrainedConfig.from_json_file(config_file)
91 | checkpoint_version = config.transformers_version
92 | if checkpoint_version is not None and checkpoint_version != __version__:
93 | logger.warning(
94 | f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
95 | f"Transformers but your current version is {__version__}. This is not recommended and could "
96 | "yield to errors or unwanted behaviors."
97 | )
98 |
99 | if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or \
100 | os.path.isfile(adapter_model_path) or os.path.isfile(safe_adapter_model_path):
101 | # If the model is on the GPU, it still works!
102 | if is_sagemaker_mp_enabled():
103 | if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")):
104 | # If the 'user_content.pt' file exists, load with the new smp api.
105 | # Checkpoint must have been saved with the new smp api.
106 | smp.resume_from_checkpoint(
107 | path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
108 | )
109 | else:
110 | # If the 'user_content.pt' file does NOT exist, load with the old smp api.
111 | # Checkpoint must have been saved with the old smp api.
112 | if hasattr(self.args, "fp16") and self.args.fp16 is True:
113 | logger.warning(
114 | "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
115 | )
116 | state_dict = torch.load(weights_file, map_location="cpu")
117 | # Required for smp to not auto-translate state_dict from hf to smp (is already smp).
118 | state_dict["_smp_is_partial"] = False
119 | load_result = model.load_state_dict(state_dict, strict=True)
120 | # release memory
121 | del state_dict
122 | elif self.is_fsdp_enabled:
123 | self.accelerator.state.fsdp_plugin.load_model(self.accelerator, model, resume_from_checkpoint)
124 | else:
125 | if is_peft_available() and isinstance(model, PeftModel):
126 | model.load_adapter(resume_from_checkpoint, getattr(model, "active_adapter", "default"), is_trainable=True)
127 | else:
128 | # We load the model state dict on the CPU to avoid an OOM error.
129 | if self.args.save_safetensors and os.path.isfile(safe_weights_file):
130 | state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
131 | else:
132 | state_dict = torch.load(weights_file, map_location="cpu")
133 |
134 | # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
135 | # which takes *args instead of **kwargs
136 | load_result = model.load_state_dict(state_dict, False)
137 | # release memory
138 | del state_dict
139 | self._issue_warnings_after_load(load_result)
140 | else:
141 | # We load the sharded checkpoint
142 | load_result = load_sharded_checkpoint(
143 | model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors
144 | )
145 | if not is_sagemaker_mp_enabled():
146 | self._issue_warnings_after_load(load_result)
147 |
--------------------------------------------------------------------------------
/train/scripts/convert_llama_weights_to_hf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import argparse
15 | import gc
16 | import json
17 | import math
18 | import os
19 | import shutil
20 |
21 | import torch
22 |
23 | from transformers import LlamaConfig, LlamaForCausalLM
24 |
25 |
26 | """
27 | Sample usage:
28 |
29 | ```
30 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \
31 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
32 | ```
33 |
34 | Thereafter, models can be loaded via:
35 |
36 | ```py
37 | from transformers import LlamaForCausalLM, LlamaTokenizer
38 |
39 | model = LlamaForCausalLM.from_pretrained("/output/path")
40 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
41 | ```
42 |
43 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
44 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
45 | """
46 |
47 | INTERMEDIATE_SIZE_MAP = {
48 | "7B": 11008,
49 | "13B": 13824,
50 | "33B": 17920,
51 | "65B": 22016,
52 | }
53 | NUM_SHARDS = {
54 | "7B": 1,
55 | "13B": 2,
56 | "33B": 4,
57 | "65B": 8,
58 | }
59 |
60 |
61 | def compute_intermediate_size(n):
62 | return int(math.ceil(n * 8 / 3) + 255) // 256 * 256
63 |
64 |
65 | def read_json(path):
66 | with open(path, "r") as f:
67 | return json.load(f)
68 |
69 |
70 | def write_json(text, path):
71 | with open(path, "w") as f:
72 | json.dump(text, f)
73 |
74 |
75 | def write_model(model_path, input_base_path, model_size):
76 | os.makedirs(model_path, exist_ok=True)
77 | tmp_model_path = os.path.join(model_path, "tmp")
78 | os.makedirs(tmp_model_path, exist_ok=True)
79 |
80 | params = read_json(os.path.join(input_base_path, "params.json"))
81 | num_shards = NUM_SHARDS[model_size]
82 | n_layers = params["n_layers"]
83 | n_heads = params["n_heads"]
84 | n_heads_per_shard = n_heads // num_shards
85 | dim = params["dim"]
86 | dims_per_head = dim // n_heads
87 | base = 10000.0
88 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
89 |
90 | # permute for sliced rotary
91 | def permute(w):
92 | return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
93 |
94 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
95 | # Load weights
96 | if model_size == "7B":
97 | # Not sharded
98 | # (The sharded implementation would also work, but this is simpler.)
99 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
100 | else:
101 | # Sharded
102 | loaded = [
103 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
104 | for i in range(num_shards)
105 | ]
106 | param_count = 0
107 | index_dict = {"weight_map": {}}
108 | for layer_i in range(n_layers):
109 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
110 | if model_size == "7B":
111 | # Unsharded
112 | state_dict = {
113 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
114 | loaded[f"layers.{layer_i}.attention.wq.weight"]
115 | ),
116 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
117 | loaded[f"layers.{layer_i}.attention.wk.weight"]
118 | ),
119 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
120 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
121 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
122 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
123 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
124 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
125 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
126 | }
127 | else:
128 | # Sharded
129 | # Note that in the 13B checkpoint, not cloning the two following weights will result in the checkpoint
130 | # becoming 37GB instead of 26GB for some reason.
131 | state_dict = {
132 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
133 | f"layers.{layer_i}.attention_norm.weight"
134 | ].clone(),
135 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
136 | f"layers.{layer_i}.ffn_norm.weight"
137 | ].clone(),
138 | }
139 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
140 | torch.cat(
141 | [
142 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
143 | for i in range(num_shards)
144 | ],
145 | dim=0,
146 | ).reshape(dim, dim)
147 | )
148 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
149 | torch.cat(
150 | [
151 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
152 | for i in range(num_shards)
153 | ],
154 | dim=0,
155 | ).reshape(dim, dim)
156 | )
157 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
158 | [
159 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
160 | for i in range(num_shards)
161 | ],
162 | dim=0,
163 | ).reshape(dim, dim)
164 |
165 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
166 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
167 | )
168 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
169 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
170 | )
171 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
172 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
173 | )
174 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
175 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
176 | )
177 |
178 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
179 | for k, v in state_dict.items():
180 | index_dict["weight_map"][k] = filename
181 | param_count += v.numel()
182 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
183 |
184 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
185 | if model_size == "7B":
186 | # Unsharded
187 | state_dict = {
188 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"],
189 | "model.norm.weight": loaded["norm.weight"],
190 | "lm_head.weight": loaded["output.weight"],
191 | }
192 | else:
193 | state_dict = {
194 | "model.norm.weight": loaded[0]["norm.weight"],
195 | "model.embed_tokens.weight": torch.cat(
196 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
197 | ),
198 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
199 | }
200 |
201 | for k, v in state_dict.items():
202 | index_dict["weight_map"][k] = filename
203 | param_count += v.numel()
204 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
205 |
206 | # Write configs
207 | index_dict["metadata"] = {"total_size": param_count * 2}
208 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
209 |
210 | config = LlamaConfig(
211 | hidden_size=dim,
212 | intermediate_size=compute_intermediate_size(dim),
213 | num_attention_heads=params["n_heads"],
214 | num_hidden_layers=params["n_layers"],
215 | rms_norm_eps=params["norm_eps"],
216 | )
217 | config.save_pretrained(tmp_model_path)
218 |
219 | # Make space so we can load the model properly now.
220 | del state_dict
221 | del loaded
222 | gc.collect()
223 |
224 | print("Loading the checkpoint in a Llama model.")
225 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
226 | # Avoid saving this as part of the config.
227 | del model.config._name_or_path
228 |
229 | print("Saving in the Transformers format.")
230 | model.save_pretrained(model_path)
231 | shutil.rmtree(tmp_model_path)
232 |
233 |
234 | def write_tokenizer(tokenizer_path, input_tokenizer_path):
235 | print(f"Fetching the tokenizer from {input_tokenizer_path}.")
236 | os.makedirs(tokenizer_path, exist_ok=True)
237 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json"))
238 | write_json(
239 | {
240 | "bos_token": "",
241 | "eos_token": "",
242 | "model_max_length": int(1e30),
243 | "tokenizer_class": "LlamaTokenizer",
244 | "unk_token": "",
245 | },
246 | os.path.join(tokenizer_path, "tokenizer_config.json"),
247 | )
248 | shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
249 |
250 |
251 | def main():
252 | parser = argparse.ArgumentParser()
253 | parser.add_argument(
254 | "--input_dir",
255 | help="Location of LLaMA weights, which contains tokenizer.model and model folders",
256 | )
257 | parser.add_argument(
258 | "--model_size",
259 | choices=["7B", "13B", "33B", "65B", "tokenizer_only"],
260 | )
261 | parser.add_argument(
262 | "--output_dir",
263 | help="Location to write HF model and tokenizer",
264 | )
265 | args = parser.parse_args()
266 | if args.model_size != "tokenizer_only":
267 | write_model(
268 | model_path=args.output_dir,
269 | input_base_path=os.path.join(args.input_dir, args.model_size),
270 | model_size=args.model_size,
271 | )
272 | write_tokenizer(
273 | tokenizer_path=args.output_dir,
274 | input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
275 | )
276 |
277 |
278 | if __name__ == "__main__":
279 | main()
--------------------------------------------------------------------------------
/train/dschat_train_v1/training_scripts/convert_llama_weights_to_hf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import argparse
15 | import gc
16 | import json
17 | import math
18 | import os
19 | import shutil
20 |
21 | import torch
22 |
23 | from transformers import LlamaConfig, LlamaForCausalLM
24 |
25 |
26 | """
27 | Sample usage:
28 |
29 | ```
30 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \
31 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
32 | ```
33 |
34 | Thereafter, models can be loaded via:
35 |
36 | ```py
37 | from transformers import LlamaForCausalLM, LlamaTokenizer
38 |
39 | model = LlamaForCausalLM.from_pretrained("/output/path")
40 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
41 | ```
42 |
43 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
44 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
45 | """
46 |
47 | INTERMEDIATE_SIZE_MAP = {
48 | "7B": 11008,
49 | "13B": 13824,
50 | "33B": 17920,
51 | "65B": 22016,
52 | }
53 | NUM_SHARDS = {
54 | "7B": 1,
55 | "13B": 2,
56 | "33B": 4,
57 | "65B": 8,
58 | }
59 |
60 |
61 | def compute_intermediate_size(n):
62 | return int(math.ceil(n * 8 / 3) + 255) // 256 * 256
63 |
64 |
65 | def read_json(path):
66 | with open(path, "r") as f:
67 | return json.load(f)
68 |
69 |
70 | def write_json(text, path):
71 | with open(path, "w") as f:
72 | json.dump(text, f)
73 |
74 |
75 | def write_model(model_path, input_base_path, model_size):
76 | os.makedirs(model_path, exist_ok=True)
77 | tmp_model_path = os.path.join(model_path, "tmp")
78 | os.makedirs(tmp_model_path, exist_ok=True)
79 |
80 | params = read_json(os.path.join(input_base_path, "params.json"))
81 | num_shards = NUM_SHARDS[model_size]
82 | n_layers = params["n_layers"]
83 | n_heads = params["n_heads"]
84 | n_heads_per_shard = n_heads // num_shards
85 | dim = params["dim"]
86 | dims_per_head = dim // n_heads
87 | base = 10000.0
88 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
89 |
90 | # permute for sliced rotary
91 | def permute(w):
92 | return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
93 |
94 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
95 | # Load weights
96 | if model_size == "7B":
97 | # Not sharded
98 | # (The sharded implementation would also work, but this is simpler.)
99 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
100 | else:
101 | # Sharded
102 | loaded = [
103 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
104 | for i in range(num_shards)
105 | ]
106 | param_count = 0
107 | index_dict = {"weight_map": {}}
108 | for layer_i in range(n_layers):
109 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
110 | if model_size == "7B":
111 | # Unsharded
112 | state_dict = {
113 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
114 | loaded[f"layers.{layer_i}.attention.wq.weight"]
115 | ),
116 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
117 | loaded[f"layers.{layer_i}.attention.wk.weight"]
118 | ),
119 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
120 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
121 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
122 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
123 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
124 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
125 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
126 | }
127 | else:
128 | # Sharded
129 | # Note that in the 13B checkpoint, not cloning the two following weights will result in the checkpoint
130 | # becoming 37GB instead of 26GB for some reason.
131 | state_dict = {
132 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
133 | f"layers.{layer_i}.attention_norm.weight"
134 | ].clone(),
135 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
136 | f"layers.{layer_i}.ffn_norm.weight"
137 | ].clone(),
138 | }
139 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
140 | torch.cat(
141 | [
142 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
143 | for i in range(num_shards)
144 | ],
145 | dim=0,
146 | ).reshape(dim, dim)
147 | )
148 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
149 | torch.cat(
150 | [
151 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
152 | for i in range(num_shards)
153 | ],
154 | dim=0,
155 | ).reshape(dim, dim)
156 | )
157 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
158 | [
159 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
160 | for i in range(num_shards)
161 | ],
162 | dim=0,
163 | ).reshape(dim, dim)
164 |
165 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
166 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
167 | )
168 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
169 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
170 | )
171 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
172 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
173 | )
174 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
175 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
176 | )
177 |
178 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
179 | for k, v in state_dict.items():
180 | index_dict["weight_map"][k] = filename
181 | param_count += v.numel()
182 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
183 |
184 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
185 | if model_size == "7B":
186 | # Unsharded
187 | state_dict = {
188 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"],
189 | "model.norm.weight": loaded["norm.weight"],
190 | "lm_head.weight": loaded["output.weight"],
191 | }
192 | else:
193 | state_dict = {
194 | "model.norm.weight": loaded[0]["norm.weight"],
195 | "model.embed_tokens.weight": torch.cat(
196 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
197 | ),
198 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
199 | }
200 |
201 | for k, v in state_dict.items():
202 | index_dict["weight_map"][k] = filename
203 | param_count += v.numel()
204 | torch.save(state_dict, os.path.join(tmp_model_path, filename))
205 |
206 | # Write configs
207 | index_dict["metadata"] = {"total_size": param_count * 2}
208 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
209 |
210 | config = LlamaConfig(
211 | hidden_size=dim,
212 | intermediate_size=compute_intermediate_size(dim),
213 | num_attention_heads=params["n_heads"],
214 | num_hidden_layers=params["n_layers"],
215 | rms_norm_eps=params["norm_eps"],
216 | )
217 | config.save_pretrained(tmp_model_path)
218 |
219 | # Make space so we can load the model properly now.
220 | del state_dict
221 | del loaded
222 | gc.collect()
223 |
224 | print("Loading the checkpoint in a Llama model.")
225 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
226 | # Avoid saving this as part of the config.
227 | del model.config._name_or_path
228 |
229 | print("Saving in the Transformers format.")
230 | model.save_pretrained(model_path)
231 | shutil.rmtree(tmp_model_path)
232 |
233 |
234 | def write_tokenizer(tokenizer_path, input_tokenizer_path):
235 | print(f"Fetching the tokenizer from {input_tokenizer_path}.")
236 | os.makedirs(tokenizer_path, exist_ok=True)
237 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json"))
238 | write_json(
239 | {
240 | "bos_token": "",
241 | "eos_token": "",
242 | "model_max_length": int(1e30),
243 | "tokenizer_class": "LlamaTokenizer",
244 | "unk_token": "",
245 | },
246 | os.path.join(tokenizer_path, "tokenizer_config.json"),
247 | )
248 | shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
249 |
250 |
251 | def main():
252 | parser = argparse.ArgumentParser()
253 | parser.add_argument(
254 | "--input_dir",
255 | help="Location of LLaMA weights, which contains tokenizer.model and model folders",
256 | )
257 | parser.add_argument(
258 | "--model_size",
259 | choices=["7B", "13B", "33B", "65B", "tokenizer_only"],
260 | )
261 | parser.add_argument(
262 | "--output_dir",
263 | help="Location to write HF model and tokenizer",
264 | )
265 | args = parser.parse_args()
266 | if args.model_size != "tokenizer_only":
267 | write_model(
268 | model_path=args.output_dir,
269 | input_base_path=os.path.join(args.input_dir, args.model_size),
270 | model_size=args.model_size,
271 | )
272 | write_tokenizer(
273 | tokenizer_path=args.output_dir,
274 | input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
275 | )
276 |
277 |
278 | if __name__ == "__main__":
279 | main()
--------------------------------------------------------------------------------
/train/dschat_train_v1/README.md:
--------------------------------------------------------------------------------
1 | # Project Introduction
2 |
3 | The code in this directory is based on the [Deepspeed-Chat](https://github.com/microsoft/DeepSpeedExamples) project and can be used to fine-tune large language models, supporting both full-parameter fine-tuning and parameter-efficient LoRA-based tuning.
4 |
5 | ## 1. Environment Setup
6 |
7 | We provide a complete, ready-to-run Docker environment; the Dockerfile is located in the docker folder.
8 |
9 | Since building the image can be troublesome, we also publish a prebuilt image. You can pull it from Docker Hub with the commands below and run the code inside the container.
10 |
11 | ```shell
12 | docker pull belleagi/belle:v1.0
13 | git clone https://github.com/LianjiaTech/BELLE.git
14 | docker run -it --runtime=nvidia --shm-size="40g" -v /path/BELLE/train:/workspace/BELLE-train -v /path/huggingface_models/:/workspace/model_name_or_path belleagi/belle:v1.0 /bin/bash
15 | ```
16 |
17 | The commands above do the following:
18 |
19 | 1. Pull the Docker image
20 | 2. Clone the BELLE repository
21 | 3. Mount the BELLE/train directory to /workspace/BELLE-train inside the container
22 | 4. Mount the huggingface_models directory to /workspace/model_name_or_path inside the container, where huggingface_models is the directory holding all required pretrained language models, e.g. llama-7b, bloomz-7b1-mt
23 | 5. Note: the mounted directories must be absolute paths
24 |
25 | ## 2. Run
26 |
27 | ### 2.1 Download data
28 |
29 | ```bash
30 | wget https://huggingface.co/datasets/BelleGroup/school_math_0.25M/resolve/main/school_math_0.25M.json
31 | ```
32 |
33 | This dataset contains roughly 250k math word problems together with their step-by-step solutions. Examples:
34 |
35 | | Instruction | Answer |
36 | | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
37 | | 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天? | 一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可: 25 ÷ 9 = 2余7 也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。 因此,一箱鸡蛋能够供小明吃2天。 |
38 | | 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果? | 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。 2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 计算过程如下: 15 - 6 = 9 所以,小明还剩下9颗糖果。 |
39 |
40 | #### 2.1.1 Prepare data
41 |
42 | ```bash
43 | python training_scripts/convert_to_conv_data.py --orig_data school_math_0.25M.json --write_data school_math_0.25M_conv.json --dataset_name bellemath
44 | head -n 1000 school_math_0.25M_conv.json > belleMath-dev1K.json
45 | tail -n +1001 school_math_0.25M_conv.json > belleMath.json
46 | ```
47 |
48 | We take the first 1,000 samples as the validation set and use the rest as the training set.
49 |
50 | We wrap each Instruction with "Human:" at the beginning and "Assistant:" at the end to form the model input, for example:
51 |
52 | | Instruction |
53 | | -------------------------------------------------------------------------------------------------------------------- |
54 | | Human: 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?\n\nAssistant: |
55 | | Human: 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?\n\nAssistant: |
56 |
57 | **Note:** the code in this repository must handle both single-turn and multi-turn data. Earlier open-source datasets in the (instruction, output) or (input, target) format (which covers most open single-turn instruction data) therefore need to be converted with the convert_to_conv_data.py script into the following unified format:
58 |
59 | ```bash
60 | {"id": "bellemath42", "conversations": [{"from": "human", "value": " 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?"}, {"from": "assistant", "value": "\n一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可:\n25 ÷ 9 = 2余7\n也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。\n因此,一箱鸡蛋能够供小明吃2天。"}]}
61 | {"id": "bellemath80", "conversations": [{"from": "human", "value": "小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?"}, {"from": "assistant", "value": " \n1. 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。\n2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 \n计算过程如下:\n15 - 6 = 9\n所以,小明还剩下9颗糖果。"}]}
62 | ```
63 |
64 | Other training datasets are available at https://huggingface.co/BelleGroup and can be converted with the same procedure; a minimal sketch of the conversion logic is shown below.
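
For illustration only, the following sketch shows what such a conversion boils down to, assuming the single-turn (instruction, input, output) schema; the authoritative logic is in convert_to_conv_data.py and may differ in details such as argument handling:

```python
import json

def to_conv_format(orig_path, write_path, dataset_name):
    # Map each single-turn (instruction, input, output) record onto the unified
    # "conversations" format shown above. This is a simplified illustration.
    with open(orig_path, encoding="utf-8") as fin, open(write_path, "w", encoding="utf-8") as fout:
        for idx, line in enumerate(fin):
            sample = json.loads(line)
            human = sample.get("instruction", "") + sample.get("input", "")
            conv = {
                "id": f"{dataset_name}{idx}",
                "conversations": [
                    {"from": "human", "value": human},
                    {"from": "assistant", "value": sample.get("output", sample.get("target", ""))},
                ],
            }
            fout.write(json.dumps(conv, ensure_ascii=False) + "\n")

# Example: to_conv_format("school_math_0.25M.json", "school_math_0.25M_conv.json", "bellemath")
```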
65 |
66 | #### 2.1.2 Multi-turn data
67 |
68 | [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main) is a large-scale open-source multi-turn dialogue dataset; for its effect on model quality see our work: [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://arxiv.org/pdf/2304.07854.pdf)
69 |
70 | The current code already supports training on this kind of multi-turn dialogue data. Download it with:
71 |
72 | ```bash
73 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
74 | mv ShareGPT_V3_unfiltered_cleaned_split.json shareGPT.json
75 | ```
76 |
77 | ### 2.2 Train
78 |
79 | Both single-GPU and multi-GPU training on a single node are supported. Unlike [Deepspeed-Chat](https://github.com/microsoft/DeepSpeedExamples), we only target stage 1, i.e. the SFT stage (more precisely, instruction tuning).
80 |
81 | #### 2.2.1 Multi-GPU training on a single node
82 |
83 | ##### Fine-Tuning
84 |
85 | To run multi-GPU fine-tuning on a single node, simply run:
86 |
87 | ```bash
88 | bash training_scripts/single_node/run_FT.sh output 2
89 | ```
90 |
91 | - output is the path where data and models are saved; it will be created if it does not exist.
92 | - 2 is the zero_stage
93 |
94 | The underlying launch command and parameter configuration are as follows:
95 |
96 | ```bash
97 | deepspeed main.py \
98 | --sft_only_data_path belleMath.json \
99 | --eval_data_file belleMath-dev1K.json \
100 | --model_name_or_path /workspace/model_name_or_path/hf_llama_7b \
101 | --per_device_train_batch_size 1 \
102 | --per_device_eval_batch_size 1 \
103 | --max_seq_len 1024 \
104 | --learning_rate 5e-7 \
105 | --weight_decay 0.0001 \
106 | --num_train_epochs 1 \
107 | --gradient_accumulation_steps 4 \
108 | --lr_scheduler_type cosine \
109 | --num_warmup_steps 100 \
110 | --seed 1234 \
111 | --gradient_checkpointing \
112 | --zero_stage $ZERO_STAGE \
113 | --deepspeed \
114 | --output_dir $OUTPUT \
115 | --data_output_path $data_output_path \
116 | ```
117 |
118 | - sft_only_data_path is the training set. To switch to shareGPT, simply replace it with shareGPT.json
119 | - eval_data_file is the validation set. If you have not pre-split the data into train and validation sets, this argument can be omitted, in which case 1,000 samples are randomly drawn from the training data as validation data (see the sketch after this list)
120 | - model_name_or_path is the base model. We recommend further fine-tuning on top of one of our open-source models (e.g. [BelleGroup/BELLE-LLaMA-EXT-7B](https://huggingface.co/BelleGroup/BELLE-LLaMA-EXT-7B)), so that only a small amount of training data and few epochs are needed to obtain a reasonably good model.
121 | - zero_stage. Prefer 1 or 2; set it to 3 if you run out of GPU memory. For a detailed introduction to ZeRO stages see: https://www.deepspeed.ai/tutorials/zero/
122 |
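The automatic validation split mentioned above is implemented in utils/data/raw_datasets.py; in essence it does the following:

```python
from datasets import load_dataset

# Load the converted conversation data and, when no eval_data_file is given,
# hold out 1,000 randomly chosen samples as the validation set (seed fixed to 42).
raw = load_dataset("json", data_files="belleMath.json", cache_dir="output/data_files")
train_val = raw["train"].train_test_split(test_size=1000, shuffle=True, seed=42)
train_data, eval_data = train_val["train"], train_val["test"]
print(train_data, eval_data)
```
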
123 | ##### LoRA
124 |
125 | To run multi-GPU LoRA-based tuning on a single node, run:
126 |
127 | ```bash
128 | bash training_scripts/single_node/run_LoRA.sh output-lora 2
129 | ```
130 |
131 | - output-lora is the path where data and models are saved; it will be created if it does not exist.
132 | - 2 is the zero_stage
133 |
134 | The underlying launch command and parameter configuration are as follows:
135 |
136 | ```bash
137 | model_name_or_path=/workspace/model_name_or_path/hf_llama_7b
138 | lora_module_name="q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj"
139 | echo ${lora_module_name}
140 |
141 | deepspeed main.py \
142 | --sft_only_data_path belleMath.json \
143 | --eval_data_file belleMath-dev1K.json \
144 | --data_split 10,0,0 \
145 | --model_name_or_path ${model_name_or_path} \
146 | --per_device_train_batch_size 16 \
147 | --per_device_eval_batch_size 1 \
148 | --max_seq_len 1024 \
149 | --learning_rate 3e-4 \
150 | --weight_decay 0. \
151 | --num_train_epochs 1 \
152 | --gradient_accumulation_steps 1 \
153 | --lr_scheduler_type cosine \
154 | --num_warmup_steps 100 \
155 | --seed 1234 \
156 | --gradient_checkpointing \
157 | --zero_stage $ZERO_STAGE \
158 | --lora_dim 16 \
159 | --lora_alpha 16 \
160 | --lora_droppout 0.05 \
161 | --lora_module_name ${lora_module_name} \
162 | --deepspeed \
163 | --output_dir $OUTPUT_PATH \
164 | ```
165 |
166 | - lora_module_name lists the modules that LoRA adapts; in our experiments we adapt the attention and MLP weights. Weight names differ across pre-trained models: for Bloom, the attention weights are named query_key_value, so lora_module_name can be set to "query_key_value,mlp".
167 | - lora_dim, lora_alpha, and lora_droppout are the LoRA training hyperparameters (see the sketch after this list for how they roughly map onto a peft-style LoRA config).
168 |
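For intuition only, the sketch below expresses the same hyperparameters as a `peft` `LoraConfig`. This is an analogy rather than the training code used here, which relies on its own LoRA implementation in utils/module/lora.py and its own flag names.

```python
# Rough peft-style equivalent of the LoRA flags above (illustrative only).
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,               # ~ --lora_dim
    lora_alpha=16,      # ~ --lora_alpha
    lora_dropout=0.05,  # ~ --lora_droppout
    target_modules=[    # ~ --lora_module_name (LLaMA attention + MLP projections)
        "q_proj", "k_proj", "v_proj", "o_proj",
        "down_proj", "gate_proj", "up_proj",
    ],
    task_type="CAUSAL_LM",
)
print(lora_config)
```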
169 | #### 2.2.2 Single-GPU training
170 |
171 | ##### Fine-Tuning
172 |
173 | To fine-tune on a single GPU, simply run the following command:
174 |
175 | ```bash
176 | bash training_scripts/single_gpu/run_FT.sh output 3
177 | ```
178 |
179 | All other settings are the same as above.
180 |
181 | ##### LoRA
182 |
183 | To run LoRA-based tuning on a single GPU, run the following command:
184 |
185 | ```bash
186 | bash training_scripts/single_gpu/run_LoRA.sh output-lora 3
187 | ```
188 |
189 | All other settings are the same as above.
190 |
191 | If you run out of GPU memory, adjust per_device_train_batch_size, max_seq_len, and zero_stage. You can also refer to the parameter settings in the launch scripts of [Deepspeed-Chat-training_scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts).
192 |
193 | The remaining parameters are documented at: https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md
194 |
195 | **Note:**
196 |
197 | - For single-turn instruction data such as [BELLE-2M](https://huggingface.co/datasets/BelleGroup/train_2M_CN), we recommend setting max_seq_len to between 512 and 1024 for Bloom models; for LLaMA models, max_seq_len should preferably not be below 1024. This avoids cutting sentences into too many incomplete fragments, which hurts learning.
198 | - For multi-turn conversation data such as [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main), we recommend setting max_seq_len to 2048.
199 | - The parameters in the launch scripts should be adjusted to your actual setup; some reference values:
200 |   - learning_rate: on the order of 5e-6 for full fine-tuning (FT) and 3e-4 for LoRA training
201 |   - epochs: 2 or 3 for FT, 4 or 5 for LoRA training
202 |
203 | ### 2.3 Generate
204 |
205 | Now we load the trained model and check the quality of its generated text.
206 |
207 | ```bash
208 | CUDA_VISIBLE_DEVICES=0 python prompt_eval.py \
209 | --model_name_or_path model_name_or_path \
210 | --finetuned_model_name_or_path finetuned_model_name_or_path \
211 | --test_file test_file_name_or_path
212 | ```
213 |
214 | Parameter description:
215 |
216 | - model_name_or_path is the path to the original pre-trained model
217 | - finetuned_model_name_or_path is the model saved after training
218 | - test_file is the validation set; the default path is belleMath-dev1K.json
219 |
220 | Example:
221 |
222 | ```bash
223 | CUDA_VISIBLE_DEVICES=0 python prompt_eval.py \
224 | --model_name_or_path /workspace/model_name_or_path/hf_llama_7b \
225 | --finetuned_model_name_or_path output-lora \
226 | --test_file belleMath-dev1K.json
227 | ```
228 |
229 | The generated outputs are saved in predictions.json.
230 |
231 | Some example predictions:
232 |
233 | ```bash
234 | {"instruction": "小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?", "generate_text": "Human: 小明有12个橙子,他想把它们分给他的4个朋友,每人分到的橙子数量相同,每人能分到几个橙子?\nAssistant: 将12个橙子平均分成4份,可以用除法来解决。首先,我们需要计算出每一份应该包含多少个橙子:\n12÷4=3\n因此,每个朋友都会得到3个橙子。答案是每人能分到3个橙子。"}
235 | {"instruction": "以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?", "generate_text": "Human: 以下是一道小学数学题:小明家里有 3 只宠物猫和 2 只宠物狗,小花家里有 4 只宠物猫和 1 只宠物狗,谁家里宠物更多?\nAssistant: 首先,我们需要计算出两个人的宠物总数。小明家里共有 5 只宠物(3 + 2 = 5);小花家里共有 6 只宠物(4 + 1 = 5)。因此,两个人的宠物数量相同。所以,无法确定谁家里宠物更多。"}
236 | {"instruction": "题目:小明有5个球,他送给小红2个球,还剩多少个球?", "generate_text": "Human: 题目:小明有5个球,他送给小红2个球,还剩多少个球?\nAssistant: 小明原来有5个球,送出去了2个球后,就是5-2=3。所以小明现在手里只剩下3个球了。答案为3。"}
237 | {"instruction": "请问2+3等于几?", "generate_text": "Human: 请问2+3等于几?\nAssistant: 将两个数字相加,得到答案为5。因此,2+3=5。"}
238 | ```
239 |
240 | The generate method uses the following parameter configuration (a usage sketch follows the block):
241 |
242 | ```python
243 | max_new_tokens = 1024
244 | generation_config = dict(
245 | temperature=0.001,
246 | top_k=30,
247 | top_p=0.85,
248 | do_sample=True,
249 | num_beams=1,
250 | repetition_penalty=1.2,
251 | max_new_tokens=max_new_tokens
252 | )
253 | ```
254 |
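As a reference for how such a configuration is typically consumed, here is a hedged sketch that passes the same keyword arguments to a Hugging Face `generate()` call. The model path and prompt template are placeholders, not the exact ones used by prompt_eval.py.

```python
# Minimal sketch: feed the generation_config above into transformers' generate().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/finetuned_model"  # placeholder; for LLaMA checkpoints LlamaTokenizer may be required
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda().eval()

generation_config = dict(
    temperature=0.001, top_k=30, top_p=0.85,
    do_sample=True, num_beams=1,
    repetition_penalty=1.2, max_new_tokens=1024,
)

prompt = "Human: 请问2+3等于几?\nAssistant: "  # illustrative prompt format
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, **generation_config)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```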
255 | ## 3. Additional Notes
256 |
257 | ### 3.1 Using LLaMA models
258 |
259 | #### 3.1.1 Converting the official Facebook LLaMA weights to HF format
260 |
261 | First, obtain access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama) and download the official checkpoints.
262 |
263 | ```bash
264 | python training_scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf
265 | ```
266 |
267 | When running the training script, set model_name_or_path to xx/llama-7b-hf.
268 |
269 | #### 3.1.2 Converting BELLE-LLaMA to HF format
270 |
271 | Due to the usage constraints of the LLaMA model, we can only open-source the diff against the original model (e.g. [BELLE-LLaMA-7B-2M-enc](https://huggingface.co/BelleGroup/BELLE-LLaMA-7B-2M-enc)). Once you have obtained access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama), you can follow https://github.com/LianjiaTech/BELLE/tree/main/models to convert it; the converted model is our instruction-tuned LLaMA model.
272 |
273 | ## 4. Acknowledgements
274 |
275 | 1. [Deepspeed-Chat](https://github.com/microsoft/DeepSpeedExamples)
276 |
277 | ## 5. Feedback
278 |
279 | If you have any questions, please open a GitHub Issue. Before filing one, please check https://github.com/microsoft/DeepSpeedExamples/issues to see whether a similar problem has already been solved.
280 |
281 | **Our experiments were all run on 8x A100 40G GPUs. In earlier experiments we found that running on V100 can be problematic, so if you hit errors on V100, please look up solutions yourself, mainly in the [deepspeed-chat issues](https://github.com/microsoft/DeepSpeedExamples/issues).**
282 |
283 | ## 6. FAQ
284 |
285 | We will keep updating the FAQ and categorizing the questions. The Others entry lists errors we ran into during our experiments, along with the solutions we referred to.
286 |
287 | - [1. How large a model can be trained on a single GPU](FAQ.md#1)
288 | - [2. How large a model can be trained on a single machine with multiple GPUs](FAQ.md#2)
289 | - [3. How large a model can be trained with LoRA on a single GPU](FAQ.md#3)
290 | - [4. How large a model can be trained with LoRA on a single machine with multiple GPUs](FAQ.md#4)
291 | - [5. Issues when loading the Llama tokenizer](FAQ.md#5)
292 | - [6. How much memory and time it takes to load 2M training examples](FAQ.md#6)
293 | - [7. The trained model generates very poor outputs](FAQ.md#7)
294 | - [Others](FAQ.md#Others)
295 |
296 | ## 7. Implementation details
297 |
298 | The experiment code in this repository only makes small modifications to the code under training/step1_supervised_finetuning of the Deepspeed-Chat project. The changes are:
299 |
300 | 1. A dataset class (e.g. BelleOpenSoucreDataset) is implemented in utils/data/raw_datasets.py to read the training data.
301 | 2. Since the goal of training is to teach the model to respond to human instructions, the loss is computed only on the answer text. The tokenize step in the create_dataset_split method of utils/data/data_utils.py is modified so that the labels of the human instruction tokens are set to -100 (masked); for multi-turn data, the labels of every human instruction turn are masked with -100. A minimal sketch is given below.
302 |
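To make change 2 concrete, here is a minimal, self-contained sketch of the masking idea. It is not the repository's create_dataset_split code; it only shows how the human-instruction tokens receive label -100 so that the loss is computed on the answer tokens alone.

```python
# Illustrative sketch of masking human-instruction labels with -100.
# The real logic lives in utils/data/data_utils.py (create_dataset_split).
from transformers import AutoTokenizer

IGNORE_INDEX = -100
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")  # any tokenizer works for the demo

prompt = "Human: 请问2+3等于几?\nAssistant: "
answer = "2+3=5"

prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
answer_ids = tokenizer(answer, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id]

input_ids = prompt_ids + answer_ids
labels = [IGNORE_INDEX] * len(prompt_ids) + answer_ids  # loss only on the answer (and EOS)

assert len(input_ids) == len(labels)
```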
--------------------------------------------------------------------------------
/train/docs/README_en.md:
--------------------------------------------------------------------------------
1 | # BELLE Training Code
2 |
3 | | [English](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/README_en.md) | [中文](https://github.com/LianjiaTech/BELLE/blob/main/train/README.md)
4 |
5 | ## 1. Set up the environment
6 |
7 | ### 1.1 Docker image
8 |
9 | We provide a fully runnable Docker image, with the Dockerfile located in the docker folder.
10 |
11 | Considering the difficulty of building the image, we offer an image download. You can use the following command to pull our image from DockerHub and run the code in the image environment.
12 |
13 | ```bash
14 | docker pull belleagi/belle:v1.0
15 | git clone https://github.com/LianjiaTech/BELLE.git
16 | docker run -it --runtime=nvidia --shm-size="40g" -v /path/BELLE/train:/workspace/BELLE-train -v /path/huggingface_models/:/workspace/model_name_or_path belleagi/belle:v1.0 /bin/bash
17 | ```
18 |
19 |
20 |
21 | The above command accomplishes the following:
22 |
23 | 1. Pull the docker image
24 | 2. Clone the BELLE repository
25 | 3. Mount the BELLE/train directory with the /workspace/BELLE-train directory in the Docker environment
26 | 4. Mount the huggingface_models directory with the /workspace/model_name_or_path directory in the Docker environment. The huggingface_models directory represents the save path for pre-trained language models, which stores all required pre-trained language models, such as llama-7b, bloomz-7b1-mt, etc.
27 | 5. Note: The mounted directories must be absolute paths
28 |
29 | ### 1.2 Conda
30 |
31 | If you cannot use Docker due to machine constraints or other reasons, you can also create an environment with Conda and install the required packages using pip.
32 |
33 | ```bash
34 | pip install -r requirements.txt
35 | ```
36 |
37 |
38 |
39 | However, installing DeepSpeed with pip may fail during installation or runtime. The [FAQ](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/FAQ.md) provides some tutorials on installing DeepSpeed and possible issues you may encounter.
40 |
41 | ## 2. Run
42 |
43 | ### 2.1 Download data
44 |
45 | ```bash
46 | wget https://huggingface.co/datasets/BelleGroup/school_math_0.25M/resolve/main/school_math_0.25M.json
47 | ```
48 |
49 |
50 |
51 | This dataset is for math problem-solving tasks, including the problem-solving process, with a total of about 250,000 entries. Example:
52 |
53 | | Instruction | Answer |
54 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
55 | | 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天? | 一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可: 25 ÷ 9 = 2余7 也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。 因此,一箱鸡蛋能够供小明吃2天。 |
56 | | 小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果? | 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。 2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 计算过程如下: 15 - 6 = 9 所以,小明还剩下9颗糖果。 |
57 |
58 | #### 2.1.1 Convert data format
59 |
60 | ```bash
61 | python scripts/convert_to_conv_data.py --orig_data school_math_0.25M.json --write_data school_math_0.25M_conv.json --dataset_name bellemath
62 | head -n 1000 school_math_0.25M_conv.json > belleMath-dev1K.json
63 | tail -n +1001 school_math_0.25M_conv.json > belleMath.json
64 | ```
65 |
66 |
67 |
68 | We select the first 1000 entries as the validation set, and the rest as the training set.
69 |
70 | **Note:** Currently, the repository code needs to be compatible with both single-turn and multi-turn data. Therefore, for previously open-sourced data (formatted as [instruction, output], most of the open-sourced single-turn instruction data is in this format), you need to use the convert_to_conv_data.py script to convert it to the following unified format:
71 |
72 | ```bash
73 | {"id": "bellemath42", "conversations": [{"from": "human", "value": " 一箱鸡蛋有25个,小明爸爸每天给他9个鸡蛋吃,那么一箱鸡蛋能够供小明吃几天?"}, {"from": "assistant", "value": "\n一箱鸡蛋有25个,小明每天吃9个鸡蛋,用除法计算一下即可:\n25 ÷ 9 = 2余7\n也就是说,小明第1天到第2天共吃2个鸡蛋,第3天吃剩下的7个鸡蛋。\n因此,一箱鸡蛋能够供小明吃2天。"}]}
74 | {"id": "bellemath80", "conversations": [{"from": "human", "value": "小明手中有15颗糖果,他想要分给他的朋友小红和小李,每人分得的糖果数是3颗,那么他还剩下几颗糖果?"}, {"from": "assistant", "value": " \n1. 该题可以用减法来解决。小明手中有15颗糖果,每人分得3颗,所以小红和小李一共分得6颗糖果(3+3=6)。\n2. 然后再用原来的糖果总数减去分给小红和小李的糖果数,即可得到小明手中剩余的糖果数。 \n计算过程如下:\n15 - 6 = 9\n所以,小明还剩下9颗糖果。"}]}
75 | ```
76 |
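If you want to see the transformation itself, the sketch below converts [instruction, output] records into the unified conversation format shown above. It is a simplified stand-in for scripts/convert_to_conv_data.py and assumes a JSON-lines source file with instruction/output fields, which may differ between datasets.

```python
# Simplified stand-in for convert_to_conv_data.py (assumed field names: instruction/output).
import json

def to_conversation(record, idx, dataset_name="bellemath"):
    return {
        "id": f"{dataset_name}{idx}",
        "conversations": [
            {"from": "human", "value": record["instruction"]},
            {"from": "assistant", "value": record["output"]},
        ],
    }

with open("school_math_0.25M.json", encoding="utf-8") as fin, \
     open("school_math_0.25M_conv.json", "w", encoding="utf-8") as fout:
    for idx, line in enumerate(fin):
        fout.write(json.dumps(to_conversation(json.loads(line), idx), ensure_ascii=False) + "\n")
```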
77 |
78 |
79 |
80 | Other training data can be found at: https://huggingface.co/BelleGroup. Convert the format according to the above process.
81 |
82 | For multi-turn conversation data, [shareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/tree/main) is an open-source large-scale multi-turn conversation dataset. You can refer to our work: [Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation](https://arxiv.org/pdf/2304.07854.pdf) for specific performance.
83 |
84 | The current code supports training this multi-turn conversation data. Download the data:
85 |
86 | ```bash
87 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
88 | ```
89 |
90 |
91 |
92 | ### 2.2 Model Training
93 |
94 | The training start script is located in scripts/run.sh, and you need to modify the parameters in run.sh according to your actual requirements.
95 |
96 | ```bash
97 | bash scripts/run.sh
98 | ```
99 |
100 |
101 |
102 | - model_name_or_path represents the pre-trained model (if it is a LLaMA model, it needs to be converted to hf format beforehand to be loaded with from_pretrained)
103 | - train_file represents the training data
104 | - validation_file represents the validation data
105 | - output_dir represents the path for saving training logs and models
106 | - cache_dir represents the path for caching data processing
107 | - cutoff_len represents the maximum input sequence length (for LLaMA models, it is recommended to set it to 1024 or above; for Bloom models, set it to 512 or above)
108 |
109 | The run.sh script contains both full parameter fine-tuning and LoRA training start commands. Here, we will briefly explain the meaning of each parameter in the start command.
110 |
111 | #### 2.2.1 Full parameter fine-tuning
112 |
113 | The following command is for full-parameter fine-tuning on a single machine with multiple cards, using deepspeed and the base model LLaMA:
114 |
115 | ```bash
116 | torchrun --nproc_per_node 8 train.py \
117 | --model_name_or_path ${model_name_or_path} \
118 | --llama \
119 | --deepspeed configs/deepspeed_config.json \
120 | --train_file ${train_file} \
121 | --validation_file ${validation_file} \
122 | --per_device_train_batch_size 2 \
123 | --per_device_eval_batch_size 2 \
124 | --gradient_accumulation_steps 4 \
125 | --num_train_epochs 2 \
126 | --model_max_length ${cutoff_len} \
127 | --save_strategy "steps" \
128 | --save_total_limit 3 \
129 | --learning_rate 8e-6 \
130 | --weight_decay 0.00001 \
131 | --warmup_ratio 0.05 \
132 | --lr_scheduler_type "cosine" \
133 | --logging_steps 10 \
134 | --evaluation_strategy "steps" \
135 | --fp16 True \
136 | --seed 1234 \
137 | --gradient_checkpointing True \
138 | --cache_dir ${cache_dir} \
139 | --output_dir ${output_dir}
140 | ```
141 |
142 |
143 |
144 | **Parameter Description**
145 |
146 | 1. If you want to train with a single card, just set nproc_per_node to 1.
147 | 2. If the pre-trained model is not LLaMA, remove --llama. If it is a LLaMA model, you need to specify --llama, because LLaMA models must be loaded with LlamaTokenizer; using AutoTokenizer for llama may cause infinite recursion problems, depending on the transformers version (see the sketch after this list).
148 | 3. If the runtime environment does not support deepspeed, remove --deepspeed.
149 |
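The --llama note in point 2 essentially means choosing the tokenizer class explicitly. A minimal sketch (the path is a placeholder):

```python
# Load LLaMA checkpoints with LlamaTokenizer explicitly instead of AutoTokenizer.
from transformers import AutoTokenizer, LlamaTokenizer

model_path = "xx/llama-7b-hf"  # placeholder
llama_tokenizer = LlamaTokenizer.from_pretrained(model_path)

# For non-LLaMA models (e.g. Bloom), AutoTokenizer works as usual:
# tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-7b1-mt")
```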
150 | Deepspeed parameter configuration can refer to:
151 |
152 | 1. https://www.deepspeed.ai/docs/config-json/
153 | 2. https://huggingface.co/docs/accelerate/usage_guides/deepspeed
154 |
155 | Training logs and models are saved in the output_dir directory, and the file structure in the directory should be as follows:
156 |
157 | ```
158 | output_dir/
159 | ├── checkpoint-244/
160 | │ ├── pytorch_model.bin
161 | │ ├── config.json
162 | │ └── trainer_state.json
163 | ├── checkpoint-527/
164 | │ ├── pytorch_model.bin
165 | │ ├── config.json
166 | │ └── trainer_state.json
167 | ├── pytorch_model.bin
168 | ├── print_log.txt
169 | └── config.json
170 | ```
171 |
172 |
173 |
174 | trainer_state.json records the changes in loss and learning_rate.
175 |
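Since trainer_state.json follows the standard Hugging Face Trainer layout, you can inspect the recorded loss and learning rate with a few lines like the sketch below (the checkpoint path is a placeholder).

```python
# Print the loss / learning-rate history logged by the HF Trainer.
import json

with open("output_dir/checkpoint-527/trainer_state.json") as f:  # placeholder path
    state = json.load(f)

for entry in state["log_history"]:
    if "loss" in entry:
        print(entry["step"], entry["loss"], entry.get("learning_rate"))
```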
176 | #### 2.2.2 LoRA
177 |
178 | ```bash
179 | torchrun --nproc_per_node 8 train.py \
180 | --model_name_or_path ${model_name_or_path} \
181 | --llama \
182 | --use_lora True \
183 | --use_int8_training \
184 | --lora_config configs/lora_config_llama.json \
185 | --train_file ${train_file} \
186 | --validation_file ${validation_file} \
187 | --per_device_train_batch_size 2 \
188 | --per_device_eval_batch_size 2 \
189 | --gradient_accumulation_steps 4 \
190 | --num_train_epochs 2 \
191 | --model_max_length ${cutoff_len} \
192 | --save_strategy "steps" \
193 | --save_total_limit 3 \
194 | --learning_rate 8e-6 \
195 | --weight_decay 0.00001 \
196 | --warmup_ratio 0.05 \
197 | --lr_scheduler_type "cosine" \
198 | --logging_steps 10 \
199 | --evaluation_strategy "steps" \
200 | --fp16 True \
201 | --seed 1234 \
202 | --gradient_checkpointing True \
203 | --cache_dir ${cache_dir} \
204 | --output_dir ${output_dir}
205 | ```
206 |
207 |
208 |
209 | **Parameter Description**
210 |
211 | - use_lora represents using LoRA for training.
212 | - use_int8_training represents using 8-bit quantization training, which can significantly reduce GPU memory usage.
213 | - lora_config provides the parameter configuration for LoRA. If training the Bloom model, change it to configs/lora_config_bloom.json.
214 |
215 | The file structure of the output_dir directory is as follows:
216 |
217 | ```
218 | output_dir/
219 | ├── checkpoint-244/
220 | │ ├── pytorch_model.bin
221 | │ └── trainer_state.json
222 | ├── checkpoint-527/
223 | │ ├── pytorch_model.bin
224 | │ └── trainer_state.json
225 | ├── adapter_model.bin
226 | ├── print_log.txt
227 | └── adapter_config.json
228 | ```
229 |
230 |
231 |
232 | **Note: After LoRA training, the saved adapter_model.bin may be an empty file. In this case, copy the pytorch_model.bin saved in one of the checkpoint-step directories to the output_dir directory. For example:**
233 |
234 | ```bash
235 | cd output_dir
236 | cp checkpoint-527/pytorch_model.bin adapter_model.bin
237 | ```
238 |
239 |
240 |
241 | **Make sure adapter_model.bin is a valid LoRA weight.**
242 |
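One quick way to check this is to load the file with torch and look at the stored tensors; an empty or near-empty state dict indicates the problem described above. A minimal sketch:

```python
# Sanity-check that adapter_model.bin actually contains LoRA weights.
import torch

state_dict = torch.load("output_dir/adapter_model.bin", map_location="cpu")
print(len(state_dict), "tensors stored")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
# 0 tensors -> copy pytorch_model.bin from a checkpoint-step directory as shown above.
```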
243 | #### 2.2.3 Merging LoRA Weights
244 |
245 | If you want to merge the LoRA weights with the pre-trained model, you can run the following command:
246 |
247 | ```bash
248 | bash scripts/merge_lora.sh
249 | ```
250 |
251 | The merged weights are saved in the output_path directory and can be loaded directly using from_pretrained later.
252 |
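For example, once merging has finished, the merged directory can be loaded like any ordinary Hugging Face checkpoint. The paths below are placeholders; if the merge script did not save tokenizer files alongside the weights, point the tokenizer at the base model instead.

```python
# Load the merged weights as a normal HF checkpoint (paths are placeholders).
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

merged_path = "output_path"  # directory written by scripts/merge_lora.sh

tokenizer = LlamaTokenizer.from_pretrained(merged_path)  # or the base model path
model = AutoModelForCausalLM.from_pretrained(merged_path, torch_dtype=torch.float16)
```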
253 | ## 3. Inference
254 |
255 | ### 3.1 Inference
256 |
257 | If you see this section, it means you have completed the training. Now we load the trained model to verify the text generation effect.
258 |
259 | ```bash
260 | CUDA_VISIBLE_DEVICES=0 python src/inference.py \
261 | --model_name_or_path model_name_or_path \
262 | --ckpt_path ckpt_path \
263 | --llama \
264 | --use_lora
265 | ```
266 |
267 |
268 |
269 | **Parameter Description:**
270 |
271 | - model_name_or_path is the path to the native pre-trained model.
272 | - ckpt_path is the path to the saved model after training, which is the output_dir.
273 | - llama indicates whether the base model is a LLaMA model.
274 | - use_lora indicates whether the ckpt_path is LoRA weight.
275 |
276 | **Note: After LoRA training, the saved adapter_model.bin may be an empty file. In this case, copy the pytorch_model.bin saved in one of the checkpoint-step directories to the output_dir directory.**
277 |
278 | In addition, if you have already merged the LoRA weights with the pre-trained model, you only need to specify ckpt_path as the path where the merged weights are saved, and there is no need to specify use_lora.
279 |
280 | ### 3.2 WebUI
281 |
282 | We also provide a simple interactive web interface based on gradio. To start the service:
283 |
284 | ```bash
285 | CUDA_VISIBLE_DEVICES=0 python src/interface.py \
286 | --model_name_or_path model_name_or_path \
287 | --ckpt_path ckpt_path \
288 | --llama \
289 | --use_lora
290 | ```
291 |
292 |
293 |
294 | The service access address is hostip:17860.
295 |
296 | 
297 |
298 | ## 4. Additional Notes
299 |
300 | ### 4.1 Usage of LLaMA Model
301 |
302 | #### 4.1.1 Converting official LLaMA weights from Facebook to Hugging Face format
303 |
304 | First, you need to obtain access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama) and download the official checkpoints.
305 |
306 | ```bash
307 | python training_scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf
308 | ```
309 |
310 |
311 |
312 | When running the training script, change the `model_name_or_path` to `xx/llama-7b-hf`.
313 |
314 | #### 4.1.2 Converting BELLE-LLaMA to Hugging Face format
315 |
316 | Due to the usage constraints of the LLaMA model, we can only open-source the diff from the original model, such as [BELLE-LLaMA-7B-2M-enc](https://huggingface.co/BelleGroup/BELLE-LLaMA-7B-2M-enc). Once you have obtained access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama), you can refer to https://github.com/LianjiaTech/BELLE/tree/main/models to convert the model. The converted model will be our instruction-optimized LLaMA model.
317 |
318 | ## 5. Feedback and Questions
319 |
320 | If you have any questions, please submit them in GitHub Issues. Before submitting, please check the [FAQ](https://github.com/LianjiaTech/BELLE/blob/main/train/docs/FAQ.md) to see whether a similar issue has already been addressed.
321 |
--------------------------------------------------------------------------------
/train/src/entry_point/evaluation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import sys\n",
10 | "sys.path.append('path/of/BELLE/train/src') # revise\n",
11 | "# sys.path.append('path/of/pixiu_private-main/train/src')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "[2023-06-26 14:59:13,975] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "from utils import MultiClient\n",
29 | "ip = '...'\n",
30 | "base_port = 17860\n",
31 | "worker_addrs = [\n",
32 | " f\"http://{ip}:{base_port + i}\" for i in range(8)\n",
33 | "]"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "client = MultiClient(worker_addrs)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "GENERATION_CONFIG = [\n",
52 | " # int | float (numeric value between 0 and 1) in 'Temperature' Slider component\n",
53 | " 0.01,\n",
54 | " # int | float (numeric value between 0 and 1) in 'Top p' Slider component\n",
55 | " 0.85,\n",
56 | " # int | float (numeric value between 0 and 100) in 'Top k' Slider component\n",
57 | " 30,\n",
58 | " # int | float (numeric value between 1 and 4) in 'Beams Number' Slider component\n",
59 | " 1,\n",
60 | " # do sample\n",
61 | " True,\n",
62 | " # int | float (numeric value between 1 and 2000) in 'Max New Tokens' Slider component\n",
63 | " 1024,\n",
64 | " # int | float (numeric value between 1 and 300) in 'Min New Tokens' Slider component\n",
65 | " 1,\n",
66 | " # int | float (numeric value between 1.0 and 2.0) in 'Repetition Penalty' Slider component\n",
67 | " 1.2,\n",
68 | "]"
69 | ]
70 | },
71 | {
72 | "attachments": {},
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "# 基础能力测试"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "import pandas as pd\n",
86 | "eval_df = pd.read_json('/nfs/a100-006/hanweiguang/Projects/automatic_evaluate/eval_data.jsonl', lines=True)\n",
87 | "data = eval_df['instruction'].tolist()"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 6,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# BELLE template\n",
97 | "# template = \\\n",
98 | "# \"\"\"\"Human: \n",
99 | "# {text}\n",
100 | "\n",
101 | "# Assistant: \n",
102 | "# \"\"\"\n",
103 | "\n",
104 | "# glm template\n",
105 | "template = \\\n",
106 | "\"\"\"{text}\"\"\""
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 7,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stderr",
116 | "output_type": "stream",
117 | "text": [
118 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [17:22<00:00, 1.04s/it]\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "tasks = []\n",
124 | "for i in range(len(data)):\n",
125 | " tasks.append([template.format(text=data[i])] + GENERATION_CONFIG)\n",
126 | "answers = client.predict(tasks)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 9,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "1000"
138 | ]
139 | },
140 | "execution_count": 9,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "len(answers)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 8,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/html": [
157 | "\n",
158 | "\n",
171 | "
\n",
172 | " \n",
173 | " \n",
174 | " | \n",
175 | " id | \n",
176 | " instruction | \n",
177 | " gold_response | \n",
178 | " type | \n",
179 | " response | \n",
180 | "
\n",
181 | " \n",
182 | " \n",
183 | " \n",
184 | " | 0 | \n",
185 | " 1 | \n",
186 | " 请你写出一个可以生成五个不同随机数的 Python 程序。\\n | \n",
187 | " 要生成五个不同的随机数,可以使用 Python 的 random 模块中的 randint(... | \n",
188 | " code | \n",
189 | " 以下是使用 Python 的 random 模块生成五个不同随机整数的示例代码:\\n\\n``... | \n",
190 | "
\n",
191 | " \n",
192 | " | 1 | \n",
193 | " 2 | \n",
194 | " 回答以下问题的答案。\\n中国有多少个省? | \n",
195 | " 中国目前有34个省级行政区。具体分为23个省、5个自治区、4个直辖市和2个特别行政区。以下是... | \n",
196 | " open qa | \n",
197 | " 中国有34个省级行政区,包括23个省、5个自治区、4个直辖市和2个特别行政区(香港和澳门)。 | \n",
198 | "
\n",
199 | " \n",
200 | " | 2 | \n",
201 | " 3 | \n",
202 | " 什么是机器学习?它有哪些应用场景?\\n | \n",
203 | " 机器学习是一种人工智能科学的分支,其目的是使计算机系统能够自动学习和改进,而不需要严格的编程... | \n",
204 | " open qa | \n",
205 | " 机器学习是一种人工智能领域的技术,旨在使计算机程序从数据中学习和提取模式,并利用这些知识做出... | \n",
206 | "
\n",
207 | " \n",
208 | " | 3 | \n",
209 | " 4 | \n",
210 | " 提供一个主题,让学生进行头脑风暴,产生关于该主题的想法。\\n如何提高城市化水平。 | \n",
211 | " 如何提高城市化水平?\\n头脑风暴产生的想法:\\n1. 投资基础设施:城市化的基础是基础设施。... | \n",
212 | " brainstorming | \n",
213 | " 以下是一个主题,可以激发学生的头脑风暴:\\n\\n\"创新城市化解决方案——促进可持续发展和提高... | \n",
214 | "
\n",
215 | " \n",
216 | " | 4 | \n",
217 | " 5 | \n",
218 | " 针对给定的主题,进行头脑风暴并记录所有想法。\\n如何提高公司的销售额? | \n",
219 | " 提高公司的销售额是一个涉及到多种因素的问题。下面是一些可能有用的头脑风暴提示:\\n1. 提供... | \n",
220 | " brainstorming | \n",
221 | " 以下是一些可能有助于提高公司销售额的想法:\\n\\n1. 打造一个独特的品牌形象,包括标志、标... | \n",
222 | "
\n",
223 | " \n",
224 | "
\n",
225 | "
"
226 | ],
227 | "text/plain": [
228 | " id instruction \\\n",
229 | "0 1 请你写出一个可以生成五个不同随机数的 Python 程序。\\n \n",
230 | "1 2 回答以下问题的答案。\\n中国有多少个省? \n",
231 | "2 3 什么是机器学习?它有哪些应用场景?\\n \n",
232 | "3 4 提供一个主题,让学生进行头脑风暴,产生关于该主题的想法。\\n如何提高城市化水平。 \n",
233 | "4 5 针对给定的主题,进行头脑风暴并记录所有想法。\\n如何提高公司的销售额? \n",
234 | "\n",
235 | " gold_response type \\\n",
236 | "0 要生成五个不同的随机数,可以使用 Python 的 random 模块中的 randint(... code \n",
237 | "1 中国目前有34个省级行政区。具体分为23个省、5个自治区、4个直辖市和2个特别行政区。以下是... open qa \n",
238 | "2 机器学习是一种人工智能科学的分支,其目的是使计算机系统能够自动学习和改进,而不需要严格的编程... open qa \n",
239 | "3 如何提高城市化水平?\\n头脑风暴产生的想法:\\n1. 投资基础设施:城市化的基础是基础设施。... brainstorming \n",
240 | "4 提高公司的销售额是一个涉及到多种因素的问题。下面是一些可能有用的头脑风暴提示:\\n1. 提供... brainstorming \n",
241 | "\n",
242 | " response \n",
243 | "0 以下是使用 Python 的 random 模块生成五个不同随机整数的示例代码:\\n\\n``... \n",
244 | "1 中国有34个省级行政区,包括23个省、5个自治区、4个直辖市和2个特别行政区(香港和澳门)。 \n",
245 | "2 机器学习是一种人工智能领域的技术,旨在使计算机程序从数据中学习和提取模式,并利用这些知识做出... \n",
246 | "3 以下是一个主题,可以激发学生的头脑风暴:\\n\\n\"创新城市化解决方案——促进可持续发展和提高... \n",
247 | "4 以下是一些可能有助于提高公司销售额的想法:\\n\\n1. 打造一个独特的品牌形象,包括标志、标... "
248 | ]
249 | },
250 | "execution_count": 8,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "eval_df['response'] = answers\n",
257 | "eval_df[:5]"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 10,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "eval_df.to_json('/nfs/a100-006/hanweiguang/Projects/automatic_evaluate/data/chatglm2-6b.json', lines=True, orient='records', force_ascii=False)"
267 | ]
268 | },
269 | {
270 | "attachments": {},
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "# 博学测试"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "import json\n",
284 | "with open(\"/nfs/a100-006/hanweiguang/Projects/BELLE/data/boxue/exam_1.jsonl\") as f:\n",
285 | " data = f.readlines()\n",
286 | " data = [json.loads(val) for val in data]"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "template = \\\n",
296 | "\"\"\"Human: {type}\n",
297 | "{question}\n",
298 | "{candidates}\n",
299 | "\n",
300 | "Assistant: \n",
301 | "\"\"\""
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "tasks = []\n",
311 | "for i in range(len(data)):\n",
312 | " sample = data[i]\n",
313 | " tasks.append([template.format(\n",
314 | " question=sample['question'].strip(),\n",
315 | " candidates='\\n'.join(sample['candidates']),\n",
316 | " type=sample['type']\n",
317 | " )] + GENERATION_CONFIG)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "answers = client.predict(tasks)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "import pandas as pd\n",
336 | "preds = pd.DataFrame({\"prompt\": [task[0] for task in tasks], \"pred\": answers, \"labels\": [sample[\"answer\"] for sample in data]})\n",
337 | "preds.to_excel('../../data/boxue/pred_1.xlsx')"
338 | ]
339 | },
340 | {
341 | "attachments": {},
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "# GLM DEBUG"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 14,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "name": "stderr",
355 | "output_type": "stream",
356 | "text": [
357 | "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00, 1.65s/it]\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "from transformers import AutoTokenizer, AutoModel\n",
363 | "tokenizer = AutoTokenizer.from_pretrained(\"/nfs/v100-022/xytian/chatglm2-6b\", trust_remote_code=True)\n",
364 | "model = AutoModel.from_pretrained('/nfs/v100-022/xytian/chatglm2-6b', trust_remote_code=True)\n",
365 | "model = model.half().cuda().eval()"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 15,
371 | "metadata": {},
372 | "outputs": [
373 | {
374 | "data": {
375 | "text/plain": [
376 | "(None, None, 2)"
377 | ]
378 | },
379 | "execution_count": 15,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "tokenizer.eos_token_id, tokenizer.bos_token_id, tokenizer.pad_token_id"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 18,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "name": "stderr",
395 | "output_type": "stream",
396 | "text": [
397 | "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
398 | ]
399 | },
400 | {
401 | "data": {
402 | "text/plain": [
403 | "('你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。',\n",
404 | " [('你好', '你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。')])"
405 | ]
406 | },
407 | "execution_count": 18,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "response, history = model.chat(tokenizer, \"你好\", history=[])\n",
414 | "response, history"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 19,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "from transformers import GenerationConfig"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "generation_config = GenerationConfig(\n",
433 | " temperature=0.01,\n",
434 | " top_p=0.85,\n",
435 | " top_k=30,\n",
436 | " num_beams=1,\n",
437 | " max_new_tokens=1024, # max_length=max_new_tokens+input_sequence\n",
438 | " min_new_tokens=1, # min_length=min_new_tokens+input_sequence\n",
439 | " repetition_penalty=1.2,\n",
440 | " do_sample=True,\n",
441 | " pad_token_id = tokenizer.pad_token_id\n",
442 | ")\n",
443 | "model.chat(tokenizer, \"你好\", **generation_config.to_dict())"
444 | ]
445 | }
446 | ],
447 | "metadata": {
448 | "kernelspec": {
449 | "display_name": "Python 3 (ipykernel)",
450 | "language": "python",
451 | "name": "python3"
452 | },
453 | "orig_nbformat": 4
454 | },
455 | "nbformat": 4,
456 | "nbformat_minor": 2
457 | }
--------------------------------------------------------------------------------