├── .gitignore
├── infer
│   ├── __init__.py
│   ├── infer.py
│   ├── infer_finetuning.py
│   ├── infer_ptuning.py
│   ├── infer_lora_finetuning.py
│   ├── infer_lora_finetuning_loop.py
│   └── infer_muti_lora_finetuning.py
├── config
│   ├── __init__.py
│   ├── deepspeed.yaml
│   ├── deepspeed_offload.yaml
│   ├── global.yaml
│   ├── train_ac.yaml
│   ├── train_hf.yaml
│   ├── train_cl.yaml
│   ├── train_pl.yaml
│   ├── petl.yaml
│   ├── colossalai_strategy.yaml
│   └── main.py
├── training
│   ├── __init__.py
│   ├── train_cl.py
│   ├── train_pl.py
│   ├── train_ac.py
│   └── train_hf.py
├── requirements.txt
├── train.py
├── data
│   ├── make_data_example.py
│   └── finetune_train_examples.json
├── scripts
│   ├── train_lora.sh
│   ├── train_ptv2.sh
│   ├── train_full.sh
│   ├── train_lora_int4.sh
│   └── train_lora_int8.sh
├── args.MD
├── README.md
├── data_processer.py
└── data_utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
2 | 
--------------------------------------------------------------------------------
/infer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : ssbuild
3 | # @Time : 2023/10/12 17:07
4 | 
--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2023/5/24 15:50
3 | 
4 | from config.main import *
--------------------------------------------------------------------------------
/training/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : ssbuild
3 | # @Time : 2023/10/12 17:05
4 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.30
2 | deepspeed
3 | bitsandbytes>=0.39
4 | accelerate>=0.20
5 | # aigc-zoo>=0.1.11,<=0.1.12
6 | git+https://github.com/ssbuild/deep_training#egg=deep_training
--------------------------------------------------------------------------------
/config/deepspeed.yaml:
--------------------------------------------------------------------------------
1 | "zero_allow_untested_optimizer": true
2 | "fp16":
3 |   "enabled": true
4 |   "auto_cast": false
5 |   "loss_scale": 0
6 |   "initial_scale_power": 16
7 |   "loss_scale_window": 1000
8 |   "hysteresis": 2
9 |   "min_loss_scale": 1
10 | 
11 | "zero_optimization":
12 |   "stage": 2
13 |   "allgather_partitions": true
14 |   "allgather_bucket_size": 5e8
15 |   "overlap_comm": false
16 |   "reduce_scatter": true
17 |   "reduce_bucket_size": 5e8
18 |   "contiguous_gradients": true
19 | 
20 |   "stage3_max_live_parameters": 1e9
21 |   "stage3_max_reuse_distance": 1e9
22 |   "stage3_prefetch_bucket_size": 5e8
23 |   "stage3_param_persistence_threshold": 1e6
24 |   "sub_group_size": 1e12
25 |   "elastic_checkpoint": true
26 |   "stage3_gather_16bit_weights_on_model_save": true
27 |   "ignore_unused_parameters": true
28 |   "round_robin_gradients": true
--------------------------------------------------------------------------------
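DeepSpeed ultimately consumes this file as a JSON-style dict, so a stray trailing comma after a value (e.g. `true,`) silently turns a boolean into the string "true,". A minimal round-trip check (an illustrative sketch, assuming only that PyYAML is installed):

```python
import json
import yaml  # PyYAML

with open("config/deepspeed.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# a trailing comma would survive here as the string "true,",
# so assert the types you expect before handing the dict to DeepSpeed
assert isinstance(cfg["fp16"]["enabled"], bool)
print(json.dumps(cfg, indent=2))
```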
/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : ssbuild
3 | # @Time : 2023/10/12 10:50
4 | 
5 | import os
6 | from config import global_args
7 | 
8 | def main():
9 |     trainer_backend = global_args["trainer_backend"]
10 |     if trainer_backend == "pl":
11 |         from training.train_pl import main as main_execute
12 |     elif trainer_backend == "hf":
13 |         from training.train_hf import main as main_execute
14 |     elif trainer_backend == "cl":
15 |         from training.train_cl import main as main_execute
16 |     elif trainer_backend == "ac":
17 |         from training.train_ac import main as main_execute
18 |     else:
19 |         raise ValueError(f"trainer_backend {trainer_backend} is not implemented")
20 | 
21 |     main_execute()
22 | 
23 | def _mp_fn(index):
24 |     # For xla_spawn (TPUs)
25 |     main()
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     main()
--------------------------------------------------------------------------------
/config/deepspeed_offload.yaml:
--------------------------------------------------------------------------------
1 | 
2 | "optimizer":
3 |   "type": "AdamW"
4 |   "params":
5 |     "lr": 2e-5
6 |     "betas": [0.9, 0.999]
7 |     "eps": 1e-8
8 |     "weight_decay": 0
9 | 
10 | "zero_allow_untested_optimizer": true
11 | "fp16":
12 |   "enabled": true
13 |   "auto_cast": false
14 |   "loss_scale": 0
15 |   "initial_scale_power": 16
16 |   "loss_scale_window": 1000
17 |   "hysteresis": 2
18 |   "min_loss_scale": 1
19 | "zero_optimization":
20 |   "stage": 2
21 |   "allgather_partitions": true
22 |   "allgather_bucket_size": 5e8
23 |   "overlap_comm": false
24 |   "reduce_scatter": true
25 |   "reduce_bucket_size": 5e8
26 |   "contiguous_gradients": true
27 |   "stage3_max_live_parameters": 1e9
28 |   "stage3_max_reuse_distance": 1e9
29 |   "stage3_prefetch_bucket_size": 5e8
30 |   "stage3_param_persistence_threshold": 1e6
31 |   "sub_group_size": 1e12
32 |   "elastic_checkpoint": true
33 |   "stage3_gather_16bit_weights_on_model_save": true
34 |   "ignore_unused_parameters": true
35 |   "round_robin_gradients": true
36 |   "offload_optimizer":
37 |     "device": "cpu"
38 |     "pin_memory": true
--------------------------------------------------------------------------------
/data/make_data_example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2023/2/24 12:50
3 | 
4 | 
5 | import json
6 | 
7 | 
8 | x1 = {
9 |     "id": 0, "paragraph": [
10 |         {
11 |             "q": "从南京到上海的路线",
12 |             "a": [
13 |                 "你好,南京到上海的路线如下:",
14 |                 "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。",
15 |                 "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。",
16 |                 "3. 
上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站" 17 | ] 18 | } 19 | ] 20 | } 21 | 22 | x2 = {"id": 0, "paragraph": [ 23 | 24 | { 25 | "q": "写一个诗歌,关于冬天", 26 | "a": [ 27 | "冬夜寂静冷,", 28 | "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", 29 | " ", 30 | "雪花融化成冰,", 31 | "像那雪花飘洒,", 32 | "在寒冷的冬天,", 33 | "感受春天的喜悦。", 34 | " 冬日里,", 35 | "风雪渐消,", 36 | "一片寂静,", 37 | "把快乐和温暖带回家。" 38 | ] 39 | } 40 | ] 41 | } 42 | 43 | x = [x1,x2] 44 | 45 | with open('./data/finetune_train_examples.json',mode='w',encoding='utf-8',newline='\n') as f: 46 | index = 0 47 | for i in range(100): 48 | for j in range(len(x)): 49 | index += 1 50 | x[j]['id'] = index 51 | f.write(json.dumps(x[j],ensure_ascii=False) + '\n' ) 52 | -------------------------------------------------------------------------------- /config/global.yaml: -------------------------------------------------------------------------------- 1 | global_args: 2 | trainer_backend: pl 3 | enable_deepspeed: false 4 | enable_ptv2: false 5 | enable_lora: true 6 | load_in_bit: 0 7 | config_merge: {} 8 | # 模型权重 , 对应 configconstant_mappy 9 | model_name: ChatYuan-large-v2 10 | 11 | # one of auto 16 bf16 32 12 | precision: 32 13 | quantization_config: 14 | load_in_8bit: false 15 | load_in_4bit: false 16 | llm_int8_threshold: 6.0 17 | llm_int8_has_fp16_weight: false 18 | bnb_4bit_compute_dtype: float16 # one of float16 bfloat16 float32 19 | bnb_4bit_use_double_quant: true 20 | bnb_4bit_quant_type: nf4 21 | 22 | 23 | 24 | global_models_mapper: 25 | ChatYuan-large-v2: 26 | model_type: t5 27 | model_name_or_path: /data/nlp/pre_models/torch/t5/ChatYuan-large-v2 28 | tokenizer_name: /data/nlp/pre_models/torch/t5/ChatYuan-large-v2 29 | config_name: /data/nlp/pre_models/torch/t5/ChatYuan-large-v2/config.json 30 | 31 | ChatYuan-large-v1: 32 | model_type: t5 33 | model_name_or_path: /data/nlp/pre_models/torch/t5/ChatYuan-large-v1 34 | tokenizer_name: /data/nlp/pre_models/torch/t5/ChatYuan-large-v1 35 | config_name: /data/nlp/pre_models/torch/t5/ChatYuan-large-v1/config.json 36 | 37 | 38 | PromptCLUE-base-v1-5: 39 | model_type: t5 40 | model_name_or_path: /data/nlp/pre_models/torch/t5/PromptCLUE-base-v1-5 41 | tokenizer_name: /data/nlp/pre_models/torch/t5/PromptCLUE-base-v1-5 42 | config_name: /data/nlp/pre_models/torch/t5/PromptCLUE-base-v1-5/config.json 43 | 44 | 45 | -------------------------------------------------------------------------------- /config/train_ac.yaml: -------------------------------------------------------------------------------- 1 | includes: [global.yaml, petl.yaml] 2 | 3 | # one of record lmdb arrow_stream arrow_file,parquet, 超大数据集可以使用 lmdb , 注 lmdb 存储空间比record大 4 | data_backend: parquet 5 | 6 | output_dir: ./outputs_ac 7 | overwrite_output_dir: true 8 | num_train_epochs: 20 9 | max_steps: -1 10 | save_safetensors: false 11 | save_strategy: steps 12 | save_steps: 1000 13 | save_total_limit: 10 14 | seed: 42 15 | fp16: true 16 | do_train: true 17 | train_file: 18 | - ../data/*.json 19 | 20 | do_eval: false 21 | do_predict: false 22 | per_device_train_batch_size: 2 23 | per_device_eval_batch_size: 2 24 | gradient_accumulation_steps: 1 25 | evaluation_strategy: 'no' 26 | eval_steps: 100 27 | 28 | # adamw_hf , adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused, 29 | # adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit, 30 | # paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit, 31 | # lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp 32 | 33 | optim: adamw_torch 34 | 35 | # one of 
linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau 36 | lr_scheduler_type: cosine 37 | torch_compile: false 38 | learning_rate: 2.0e-05 39 | adam_beta1: 0.9 40 | adam_beta2: 0.999 41 | adam_epsilon: 1.0e-08 42 | max_grad_norm: 1.0 43 | weight_decay: 0.0 44 | warmup_ratio: 0.03 45 | logging_strategy: steps 46 | logging_steps: 10 47 | tf32: false 48 | gradient_checkpointing: false 49 | max_seq_length: 512 50 | max_target_length: 100 51 | do_lower_case: null 52 | 53 | use_fast_tokenizer: false 54 | dataloader_drop_last: true 55 | dataloader_pin_memory: true 56 | dataloader_num_workers: 0 57 | log_level: info 58 | -------------------------------------------------------------------------------- /config/train_hf.yaml: -------------------------------------------------------------------------------- 1 | includes: [global.yaml, petl.yaml] 2 | 3 | # one of record lmdb arrow_stream arrow_file,parquet, 超大数据集可以使用 lmdb , 注 lmdb 存储空间比record大 4 | data_backend: parquet 5 | output_dir: ./outputs_hf 6 | overwrite_output_dir: true 7 | num_train_epochs: 20 8 | max_steps: -1 9 | save_safetensors: false 10 | save_strategy: steps 11 | save_steps: 1000 12 | save_total_limit: 10 13 | seed: 42 14 | fp16: true 15 | do_train: true 16 | train_file: 17 | - ../data/*.json 18 | do_eval: false 19 | do_predict: false 20 | per_device_train_batch_size: 2 21 | per_device_eval_batch_size: 2 22 | gradient_accumulation_steps: 1 23 | evaluation_strategy: 'no' 24 | eval_steps: 100 25 | 26 | 27 | # adamw_hf , adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused, 28 | # adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit, 29 | # paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit, 30 | # lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp 31 | 32 | optim: adamw_torch 33 | 34 | # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau 35 | lr_scheduler_type: cosine 36 | torch_compile: false 37 | learning_rate: 2.0e-05 38 | adam_beta1: 0.9 39 | adam_beta2: 0.999 40 | adam_epsilon: 1.0e-08 41 | max_grad_norm: 1.0 42 | weight_decay: 0.0 43 | warmup_ratio: 0.03 44 | logging_strategy: steps 45 | logging_steps: 10 46 | tf32: false 47 | gradient_checkpointing: false 48 | max_seq_length: 512 49 | max_target_length: 100 50 | 51 | do_lower_case: null 52 | use_fast_tokenizer: false 53 | dataloader_drop_last: true 54 | dataloader_pin_memory: true 55 | dataloader_num_workers: 0 56 | log_level: info 57 | 58 | -------------------------------------------------------------------------------- /scripts/train_lora.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export trainer_backend=pl 4 | 5 | train_file="../config/train_${trainer_backend}.yaml" 6 | 7 | # 强制覆盖配置文件 8 | export train_file=${train_file} 9 | export enable_deepspeed=false 10 | export enable_ptv2=false 11 | export enable_lora=true 12 | export load_in_bit=0 13 | 14 | #export CUDA_VISIBLE_DEVICES="0,1,2,3" 15 | 16 | usage() { echo "Usage: $0 [-m ]" 1>&2; exit 1; } 17 | 18 | 19 | while getopts m: opt 20 | do 21 | case "${opt}" in 22 | m) mode=${OPTARG};; 23 | *) 24 | usage 25 | ;; 26 | esac 27 | done 28 | 29 | if [ "${mode}" != "dataset" ] && [ "${mode}" != "train" ] ; then 30 | usage 31 | fi 32 | 33 | if [[ "${mode}" == "dataset" ]] ; then 34 | python ../data_utils.py 35 | exit 0 36 | fi 37 | 38 | if [[ "${trainer_backend}" == "pl" ]] ; then 39 | # pl 多卡 
修改配置文件 devices 40 | 41 | ### 多机多卡训练 42 | 43 | # 例子 3个机器 每个机器 4个卡 44 | # 修改train.py Trainer num_nodes = 3 45 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 46 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 47 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 48 | 49 | # pl 多卡 修改配置文件 devices 50 | 51 | python ../train.py 52 | elif [[ "${trainer_backend}" == "cl" ]] ; then 53 | # 多机多卡 54 | # colossalai run --nproc_per_node 1 --num_nodes 1 --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 55 | 56 | colossalai run --nproc_per_node 1 --num_nodes 1 ../train.py 57 | else 58 | # 多机多卡 59 | # --nproc_per_node=1 nnodes=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 60 | torchrun --nproc_per_node 1 --nnodes 1 ../train.py 61 | fi -------------------------------------------------------------------------------- /scripts/train_ptv2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export trainer_backend=pl 4 | 5 | train_file="../config/train_${trainer_backend}.yaml" 6 | 7 | # 强制覆盖配置文件 8 | export train_file=${train_file} 9 | export enable_deepspeed=false 10 | export enable_ptv2=true 11 | export enable_lora=false 12 | export load_in_bit=0 13 | 14 | #export CUDA_VISIBLE_DEVICES="0,1,2,3" 15 | 16 | usage() { echo "Usage: $0 [-m ]" 1>&2; exit 1; } 17 | 18 | 19 | while getopts m: opt 20 | do 21 | case "${opt}" in 22 | m) mode=${OPTARG};; 23 | *) 24 | usage 25 | ;; 26 | esac 27 | done 28 | 29 | if [ "${mode}" != "dataset" ] && [ "${mode}" != "train" ] ; then 30 | usage 31 | fi 32 | 33 | 34 | if [[ "${mode}" == "dataset" ]] ; then 35 | python ../data_utils.py 36 | exit 0 37 | fi 38 | 39 | if [[ "${trainer_backend}" == "pl" ]] ; then 40 | # pl 多卡 修改配置文件 devices 41 | 42 | ### 多机多卡训练 43 | 44 | # 例子 3个机器 每个机器 4个卡 45 | # 修改train.py Trainer num_nodes = 3 46 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 47 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 48 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 49 | 50 | # pl 多卡 修改配置文件 devices 51 | 52 | python ../train.py 53 | elif [[ "${trainer_backend}" == "cl" ]] ; then 54 | # 多机多卡 55 | # colossalai run --nproc_per_node 1 --num_nodes 1 --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 56 | 57 | colossalai run --nproc_per_node 1 --num_nodes 1 ../train.py 58 | else 59 | # 多机多卡 60 | # --nproc_per_node=1 nnodes=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 61 | torchrun --nproc_per_node 1 --nnodes 1 ../train.py 62 | fi -------------------------------------------------------------------------------- /scripts/train_full.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export trainer_backend=pl 4 | 5 | train_file="../config/train_${trainer_backend}.yaml" 6 | 7 | # 强制覆盖配置文件 8 | export train_file=${train_file} 9 | export enable_deepspeed=false 10 | export enable_ptv2=false 11 | export enable_lora=false 12 | export load_in_bit=0 13 | 14 | #export CUDA_VISIBLE_DEVICES="0,1,2,3" 15 | 16 | 17 | 18 | usage() { echo "Usage: $0 [-m ]" 1>&2; exit 1; } 19 | 20 | 21 | while getopts m: opt 22 | do 23 | case "${opt}" in 24 | m) mode=${OPTARG};; 25 | *) 26 | usage 27 | ;; 28 | esac 29 | done 30 | 31 | if [ "${mode}" != "dataset" ] && 
[ "${mode}" != "train" ] ; then 32 | usage 33 | fi 34 | 35 | if [[ "${mode}" == "dataset" ]] ; then 36 | python ../data_utils.py 37 | exit 0 38 | fi 39 | 40 | if [[ "${trainer_backend}" == "pl" ]] ; then 41 | # pl 多卡 修改配置文件 devices 42 | 43 | ### 多机多卡训练 44 | 45 | # 例子 3个机器 每个机器 4个卡 46 | # 修改train.py Trainer num_nodes = 3 47 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 48 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 49 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 50 | 51 | # pl 多卡 修改配置文件 devices 52 | 53 | python ../train.py 54 | elif [[ "${trainer_backend}" == "cl" ]] ; then 55 | # 多机多卡 56 | # colossalai run --nproc_per_node 1 --num_nodes 1 --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 57 | 58 | colossalai run --nproc_per_node 1 --num_nodes 1 ../train.py 59 | else 60 | # 多机多卡 61 | # --nproc_per_node=1 nnodes=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 62 | torchrun --nproc_per_node 1 --nnodes 1 ../train.py 63 | fi -------------------------------------------------------------------------------- /scripts/train_lora_int4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export trainer_backend=pl 4 | 5 | train_file="../config/train_${trainer_backend}.yaml" 6 | 7 | # 强制覆盖配置文件 8 | export train_file=${train_file} 9 | export enable_deepspeed=false 10 | export enable_ptv2=false 11 | export enable_lora=true 12 | export load_in_bit=4 13 | 14 | 15 | #export CUDA_VISIBLE_DEVICES="0,1,2,3" 16 | 17 | usage() { echo "Usage: $0 [-m ]" 1>&2; exit 1; } 18 | 19 | 20 | while getopts m: opt 21 | do 22 | case "${opt}" in 23 | m) mode=${OPTARG};; 24 | *) 25 | usage 26 | ;; 27 | esac 28 | done 29 | 30 | if [ "${mode}" != "dataset" ] && [ "${mode}" != "train" ] ; then 31 | usage 32 | fi 33 | 34 | if [[ "${mode}" == "dataset" ]] ; then 35 | python ../data_utils.py 36 | exit 0 37 | fi 38 | 39 | if [[ "${trainer_backend}" == "pl" ]] ; then 40 | # pl 多卡 修改配置文件 devices 41 | 42 | ### 多机多卡训练 43 | 44 | # 例子 3个机器 每个机器 4个卡 45 | # 修改train.py Trainer num_nodes = 3 46 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 47 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 48 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 49 | 50 | # pl 多卡 修改配置文件 devices 51 | 52 | python ../train.py 53 | elif [[ "${trainer_backend}" == "cl" ]] ; then 54 | # 多机多卡 55 | # colossalai run --nproc_per_node 1 --num_nodes 1 --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 56 | 57 | colossalai run --nproc_per_node 1 --num_nodes 1 ../train.py 58 | else 59 | # 多机多卡 60 | # --nproc_per_node=1 nnodes=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 61 | torchrun --nproc_per_node 1 --nnodes 1 ../train.py 62 | fi -------------------------------------------------------------------------------- /scripts/train_lora_int8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export trainer_backend=pl 4 | 5 | train_file="../config/train_${trainer_backend}.yaml" 6 | 7 | # 强制覆盖配置文件 8 | export train_file=${train_file} 9 | export enable_deepspeed=false 10 | export enable_ptv2=false 11 | export enable_lora=true 12 | export load_in_bit=8 13 | export precision="16" 14 | 15 | #export 
CUDA_VISIBLE_DEVICES="0,1,2,3" 16 | 17 | usage() { echo "Usage: $0 [-m ]" 1>&2; exit 1; } 18 | 19 | 20 | while getopts m: opt 21 | do 22 | case "${opt}" in 23 | m) mode=${OPTARG};; 24 | *) 25 | usage 26 | ;; 27 | esac 28 | done 29 | 30 | if [ "${mode}" != "dataset" ] && [ "${mode}" != "train" ] ; then 31 | usage 32 | fi 33 | 34 | if [[ "${mode}" == "dataset" ]] ; then 35 | python ../data_utils.py 36 | exit 0 37 | fi 38 | 39 | if [[ "${trainer_backend}" == "pl" ]] ; then 40 | # pl 多卡 修改配置文件 devices 41 | 42 | ### 多机多卡训练 43 | 44 | # 例子 3个机器 每个机器 4个卡 45 | # 修改train.py Trainer num_nodes = 3 46 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 47 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 48 | # MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 49 | 50 | # pl 多卡 修改配置文件 devices 51 | 52 | python ../train.py 53 | elif [[ "${trainer_backend}" == "cl" ]] ; then 54 | # 多机多卡 55 | # colossalai run --nproc_per_node 1 --num_nodes 1 --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 56 | 57 | colossalai run --nproc_per_node 1 --num_nodes 1 ../train.py 58 | else 59 | # 多机多卡 60 | # --nproc_per_node=1 nnodes=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT ../train.py 61 | torchrun --nproc_per_node 1 --nnodes 1 ../train.py 62 | fi -------------------------------------------------------------------------------- /config/train_cl.yaml: -------------------------------------------------------------------------------- 1 | 2 | includes: [global.yaml, petl.yaml, colossalai_strategy.yaml] 3 | # one of record lmdb arrow_stream arrow_file,parquet, 超大数据集可以使用 lmdb , 注 lmdb 存储空间比record大 4 | data_backend: parquet 5 | 6 | 7 | # 目前仅ddp 支持lora 8 | # one of ddp,gemini,zero2,zero2_cpu,3d 9 | strategy: ddp 10 | 11 | output_dir: ./outputs_cl 12 | overwrite_output_dir: true 13 | num_train_epochs: 20 14 | max_steps: -1 15 | save_safetensors: false 16 | save_strategy: steps 17 | save_steps: 1000 18 | save_total_limit: 10 19 | seed: 42 20 | fp16: true 21 | do_train: true 22 | train_file: 23 | - ../data/*.json 24 | do_eval: false 25 | do_predict: false 26 | per_device_train_batch_size: 2 27 | per_device_eval_batch_size: 2 28 | gradient_accumulation_steps: 1 29 | evaluation_strategy: 'no' 30 | eval_steps: 100 31 | 32 | # 优化器,如果策略使用 gemini , 则 optim one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl 33 | # 如果策略使用非 gemini ,则 optim one of follow 34 | # one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl,lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused, 35 | # adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit, 36 | # paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit, 37 | # lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp 38 | 39 | optim: adam_hybrid_cl 40 | 41 | # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau 42 | lr_scheduler_type: cosine 43 | torch_compile: false 44 | learning_rate: 2.0e-05 45 | adam_beta1: 0.9 46 | adam_beta2: 0.999 47 | adam_epsilon: 1.0e-08 48 | max_grad_norm: 1.0 49 | weight_decay: 0.0 50 | warmup_ratio: 0.03 51 | logging_strategy: steps 52 | logging_steps: 10 53 | tf32: false 54 | gradient_checkpointing: false 55 | max_seq_length: 512 56 | max_target_length: 100 57 | 58 | do_lower_case: null 59 | use_fast_tokenizer: false 60 | dataloader_drop_last: true 61 | dataloader_pin_memory: true 62 | dataloader_num_workers: 0 
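# note: dataloader_num_workers: 0 loads batches in the training process itself;
# values > 0 enable multi-process data loading at the cost of extra host memory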
63 | log_level: info 64 | 65 | -------------------------------------------------------------------------------- /config/train_pl.yaml: -------------------------------------------------------------------------------- 1 | includes: [global.yaml, petl.yaml] 2 | 3 | devices: 1 4 | data_backend: parquet 5 | convert_onnx: false 6 | do_train: true 7 | train_file: 8 | - ../data/*.json 9 | max_epochs: 20 10 | max_steps: -1 11 | 12 | # *** optimizer 13 | # lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused, 14 | # adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit, 15 | # paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit, 16 | # lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp 17 | 18 | # *** scheduler 19 | # linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau, cosine,cosine_with_restarts,polynomial, 20 | # constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau 21 | 22 | # 'scheduler_type': 'linear',# one of [linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau 23 | # 'scheduler': None, 24 | # 切换scheduler类型 25 | # 'scheduler_type': 'WarmupCosine', 26 | # 'scheduler': None, 27 | 28 | # 'scheduler_type': 'ReduceLROnPlateau', 29 | # 'scheduler': None, 30 | 31 | # 'scheduler_type': 'Step', 32 | # 'scheduler':{ 'decay_rate': 0.999,'decay_steps': 100,'verbose': True}, 33 | 34 | # 'scheduler_type': 'CAWR', 35 | # 'scheduler':{'T_mult': 1, 'rewarm_epoch_num': 2, 'verbose': True}, 36 | 37 | # 'scheduler_type': 'CAL', 38 | # 'scheduler': {'rewarm_epoch_num': 2,'verbose': True}, 39 | 40 | optimizer: lion 41 | scheduler_type: CAWR 42 | scheduler: 43 | T_mult: 1 44 | rewarm_epoch_num: 0.5 45 | verbose: false 46 | optimizer_betas: !!python/tuple 47 | - 0.9 48 | - 0.999 49 | train_batch_size: 2 50 | eval_batch_size: 2 51 | test_batch_size: 2 52 | learning_rate: 2.0e-05 53 | adam_epsilon: 1.0e-08 54 | gradient_accumulation_steps: 1 55 | max_grad_norm: 1.0 56 | weight_decay: 0 57 | warmup_steps: 0 58 | output_dir: ./outputs_pl 59 | max_seq_length: 512 60 | do_lower_case: null 61 | 62 | # 预测最大长度, 保留字段 63 | max_target_length: 100 64 | use_fast_tokenizer: false 65 | dataloader_drop_last: true 66 | dataloader_pin_memory: true 67 | dataloader_num_workers: 0 68 | 69 | 70 | -------------------------------------------------------------------------------- /infer/infer.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/3/19 18:25 2 | # @Author : tk 3 | # @FileName: infer_chatyuan 4 | 5 | 6 | # 使用 7 | import torch 8 | from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration 9 | 10 | 11 | tokenizer = T5Tokenizer.from_pretrained("./ChatYuan-large-v1") 12 | model = T5ForConditionalGeneration.from_pretrained("./ChatYuan-large-v1",torch_dtype=torch.float16,) 13 | # 修改colab笔记本设置为gpu,推理更快 14 | device = torch.device('cuda') 15 | model.to(device) 16 | 17 | 18 | def preprocess(text): 19 | text = text.replace("\n", "\\n").replace("\t", "\\t") 20 | return text 21 | 22 | 23 | def postprocess(text): 24 | return text.replace("\\n", "\n").replace("\\t", "\t") 25 | 26 | 27 | def answer(text, sample=True, top_p=1, temperature=0.7): 28 | '''sample:是否抽样。生成任务,可以设置为True; 29 | top_p:0-1之间,生成的内容越多样''' 30 | text = preprocess(text) 31 | encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=768, return_tensors="pt").to( 32 | device) 33 | if not sample: 34 | out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, 
max_new_tokens=512, 35 | num_beams=1, length_penalty=0.6) 36 | else: 37 | out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_new_tokens=512, 38 | do_sample=True, top_p=top_p, temperature=temperature, no_repeat_ngram_size=3) 39 | out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True) 40 | return postprocess(out_text[0]) 41 | 42 | 43 | 44 | 45 | input_text0 = "帮我写一个请假条,我因为新冠不舒服,需要请假3天,请领导批准" 46 | input_text1 = "你能干什么" 47 | input_text2 = "写一封英文商务邮件给英国客户,表达因为物流延误,不能如期到达,我们可以赔偿贵公司所有损失" 48 | input_text3 = "写一个文章,题目是未来城市" 49 | input_text4 = "写一个诗歌,关于冬天" 50 | input_text5 = "从南京到上海的路线" 51 | input_text6 = "学前教育专业岗位实习中,在学生方面会存在问题,请提出改进措施。800字" 52 | input_text7 = "根据标题生成文章:标题:屈臣氏里的化妆品到底怎么样?正文:化妆品,要讲究科学运用,合理搭配。屈臣氏起码是正品连锁店。请继续后面的文字。" 53 | input_text8 = "帮我对比几款GPU,列出详细参数对比,并且给出最终结论" 54 | input_list = [input_text0, input_text1, input_text2, input_text3, input_text4, input_text5, input_text6, input_text7, input_text8] 55 | for i, input_text in enumerate(input_list): 56 | input_text = "用户:" + input_text + "\n小元:" 57 | print(f"示例{i}".center(50, "=")) 58 | output_text = answer(input_text) 59 | print(f"{input_text}{output_text}") -------------------------------------------------------------------------------- /config/petl.yaml: -------------------------------------------------------------------------------- 1 | ############## lora模块 2 | 3 | lora: 4 | with_lora: true # 是否启用模块 5 | lora_type: lora 6 | r: 8 7 | lora_alpha: 32 8 | lora_dropout: 0.1 9 | fan_in_fan_out: false 10 | # Bias type for Lora. Can be 'none', 'all' or 'lora_only'" 11 | bias: none 12 | # "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 13 | modules_to_save: null 14 | layers_to_transform: null 15 | layers_pattern: null 16 | 17 | # "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " 18 | # "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" 19 | rank_pattern: {} 20 | 21 | # "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " 22 | # "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" 23 | 24 | alpha_pattern: {} 25 | adalora: 26 | with_lora: false # 是否启用模块 27 | lora_type: adalora 28 | r: 8 29 | lora_alpha: 32 30 | lora_dropout: 0.1 31 | fan_in_fan_out: false 32 | # Bias type for Lora. Can be 'none', 'all' or 'lora_only'" 33 | bias: none 34 | # "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 35 | modules_to_save: null 36 | layers_to_transform: null 37 | layers_pattern: null 38 | alpha_pattern: {} 39 | 40 | # Target Lora matrix dimension. 41 | target_r: 8 42 | #Intial Lora matrix dimension. 43 | init_r: 12 44 | #The steps of initial warmup. 45 | tinit: 0 46 | #The steps of final warmup 47 | tfinal: 0 48 | #Step interval of rank allocation. 49 | deltaT: 1 50 | #Hyperparameter of EMA. 51 | beta1: 0.85 52 | #Hyperparameter of EMA. 53 | beta2: 0.85 54 | #The orthogonal regularization coefficient. 55 | orth_reg_weight: 0.5 56 | 57 | #The total training steps. 58 | total_step: null 59 | 60 | #The saved rank pattern. 61 | rank_pattern: null 62 | 63 | ia3: 64 | with_lora: false # 是否启用模块 65 | fan_in_fan_out: false 66 | # "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. 
" 67 | modules_to_save: null 68 | init_ia3_weights: true 69 | 70 | ############## ptv2模块 71 | prompt: 72 | with_prompt: true 73 | prompt_type: prefix_tuning 74 | task_type: causal_lm 75 | prefix_projection: false 76 | num_virtual_tokens: 32 -------------------------------------------------------------------------------- /infer/infer_finetuning.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/3/19 18:15 2 | # @Author : tk 3 | # @FileName: infer 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import torch 9 | from deep_training.data_helper import ModelArguments, DataArguments, TrainingArguments 10 | from transformers import HfArgumentParser 11 | 12 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer,PetlArguments 13 | from data_utils import config_args, postprocess, NN_DataHelper,get_deepspeed_config 14 | 15 | 16 | def generate_text(base_model,text,device = torch.device('cuda:0'),max_length=128,prefix=''): 17 | input_text = prefix + "用户:" + text + "\n小元:" 18 | 19 | o = tokenizer.encode_plus(input_text, truncation=True, max_length=512, return_attention_mask=False,return_token_type_ids=False) 20 | input_ids= [o['input_ids']] 21 | input_ids = torch.tensor(input_ids, dtype=torch.int32,device=device) 22 | 23 | logits = base_model.generate(input_ids=input_ids, max_length=max_length, bos_token_id=config.decoder_start_token_id, 24 | pad_token_id=config.pad_token_id, 25 | eos_token_id=config.eos_token_id) 26 | 27 | 28 | out_text = tokenizer.decode(logits[0], skip_special_tokens=True) 29 | out_text = postprocess(out_text) 30 | return out_text 31 | 32 | 33 | deep_config = get_deepspeed_config() 34 | 35 | if __name__ == '__main__': 36 | parser = HfArgumentParser((ModelArguments, )) 37 | (model_args, ) = parser.parse_dict(config_args, allow_extra_keys=True) 38 | 39 | 40 | 41 | dataHelper = NN_DataHelper(model_args) 42 | tokenizer, config, _,_= dataHelper.load_tokenizer_and_config() 43 | 44 | pl_model = MyTransformer(config=config, model_args=model_args,torch_dtype=torch.float16,) 45 | 46 | ###################### 注意 选最新权重 47 | # 选择最新的权重 , 根据时间排序 选最新的 48 | 49 | if deep_config is None: 50 | train_weight = './best_ckpt/last-v3.ckpt' 51 | assert os.path.exists(train_weight) 52 | 53 | else: 54 | # 建议直接使用转换脚本命令 支持 deepspeed stage 0,1,2,3, 生成 ./best_ckpt/last/best.pt 权重文件 55 | # cd best_ckpt/last 56 | # python zero_to_fp32.py . 
best.pt 57 | train_weight = './best_ckpt/last/best.pt' 58 | 59 | 60 | #加载权重 61 | pl_model.load_sft_weight(train_weight) 62 | 63 | model = pl_model.get_llm_model() 64 | model.eval().half().cuda() 65 | 66 | input_list = [ 67 | "写一个诗歌,关于冬天", 68 | "晚上睡不着怎么办", 69 | "中国的首都是哪里", 70 | ] 71 | for text in input_list: 72 | output = generate_text(model,text) 73 | print('input',text) 74 | print('output',output) -------------------------------------------------------------------------------- /infer/infer_ptuning.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/4/2 22:49 2 | # @Author : tk 3 | # @FileName: infer_ptuning 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import torch 9 | from deep_training.data_helper import ModelArguments, TrainingArguments, DataArguments 10 | from transformers import HfArgumentParser,AutoConfig 11 | 12 | from data_utils import config_args, NN_DataHelper, postprocess 13 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer,PetlArguments,PromptArguments 14 | 15 | def generate_text(base_model,text,device = torch.device('cuda:0'),max_length=128): 16 | input_text = "用户:" + text + "\n小元:" 17 | 18 | o = tokenizer.encode_plus(input_text, truncation=True, max_length=512, return_attention_mask=False,return_token_type_ids=False) 19 | input_ids= [o['input_ids']] 20 | input_ids = torch.tensor(input_ids, dtype=torch.int32,device=device) 21 | 22 | logits = base_model.generate(input_ids=input_ids, max_length=max_length, bos_token_id=config.decoder_start_token_id, 23 | pad_token_id=config.pad_token_id, 24 | eos_token_id=config.eos_token_id) 25 | 26 | 27 | out_text = tokenizer.decode(logits[0], skip_special_tokens=True) 28 | out_text = postprocess(out_text) 29 | return out_text 30 | 31 | if __name__ == '__main__': 32 | config_args['seed'] = None 33 | parser = HfArgumentParser((ModelArguments,)) 34 | (model_args,) = parser.parse_dict(config_args, allow_extra_keys=True) 35 | 36 | 37 | 38 | dataHelper = NN_DataHelper(model_args) 39 | tokenizer, _, _, _ = dataHelper.load_tokenizer_and_config(config_kwargs={"torch_dtype": torch.float16}) 40 | 41 | ckpt_dir = './best_ckpt/last' 42 | config = AutoConfig.from_pretrained(ckpt_dir) 43 | prompt_args = PromptArguments.from_pretrained(ckpt_dir) 44 | 45 | assert prompt_args.inference_mode == True 46 | 47 | new_num_tokens = config.vocab_size 48 | if config.task_specific_params is not None and config.task_specific_params.get('vocab_size', None) is not None: 49 | config.vocab_size = config.task_specific_params['vocab_size'] 50 | 51 | pl_model = MyTransformer(config=config, model_args=model_args, prompt_args=prompt_args, 52 | torch_dtype=config.torch_dtype, 53 | new_num_tokens=new_num_tokens, 54 | ) 55 | # 加载sft权重 56 | pl_model.load_sft_weight(ckpt_dir) 57 | 58 | pl_model.eval().half().cuda() 59 | 60 | model = pl_model.get_llm_model() 61 | 62 | #基础模型精度 63 | model.base_model_torch_dtype = torch.half 64 | 65 | text = "写一个诗歌,关于冬天" 66 | output = generate_text(model, text) 67 | print('input', text) 68 | print('output', output) -------------------------------------------------------------------------------- /infer/infer_lora_finetuning.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/3/19 18:15 2 | # @Author : tk 3 | # @FileName: infer 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import torch 9 | 
from deep_training.data_helper import ModelArguments, DataArguments 10 | from transformers import HfArgumentParser 11 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer,PetlArguments 12 | from data_utils import config_args, postprocess, NN_DataHelper 13 | 14 | 15 | def generate_text(base_model,text,device = torch.device('cuda:0'),max_length=128,prefix=''): 16 | input_text = prefix + "用户:" + text + "\n小元:" 17 | 18 | o = tokenizer.encode_plus(input_text, truncation=True, max_length=512, return_attention_mask=False,return_token_type_ids=False) 19 | input_ids= [o['input_ids']] 20 | input_ids = torch.tensor(input_ids, dtype=torch.int32,device=device) 21 | 22 | logits = base_model.generate(input_ids=input_ids, max_length=max_length, bos_token_id=config.decoder_start_token_id, 23 | pad_token_id=config.pad_token_id, 24 | eos_token_id=config.eos_token_id) 25 | 26 | 27 | out_text = tokenizer.decode(logits[0], skip_special_tokens=True) 28 | out_text = postprocess(out_text) 29 | return out_text 30 | 31 | if __name__ == '__main__': 32 | 33 | parser = HfArgumentParser((ModelArguments, )) 34 | (model_args,) = parser.parse_dict(config_args,allow_extra_keys=True) 35 | 36 | dataHelper = NN_DataHelper(model_args) 37 | tokenizer, config, _,_= dataHelper.load_tokenizer_and_config() 38 | 39 | # 一般根据时间排序选最新的权重文件夹 40 | weight_dir = '../scripts/best_ckpt' 41 | lora_weight_dir = os.path.join(weight_dir, "last") 42 | 43 | lora_args = PetlArguments.from_pretrained(lora_weight_dir) 44 | assert lora_args.inference_mode == True 45 | 46 | new_num_tokens = config.vocab_size 47 | if config.task_specific_params is not None and config.task_specific_params.get('vocab_size', None) is not None: 48 | config.vocab_size = config.task_specific_params['vocab_size'] 49 | 50 | pl_model = MyTransformer(config=config,model_args=model_args,lora_args=lora_args, 51 | torch_dtype=config.torch_dtype, 52 | new_num_tokens=new_num_tokens, 53 | ) 54 | 55 | 56 | # 加载lora权重 57 | pl_model.load_sft_weight(lora_weight_dir) 58 | 59 | enable_merge_weight = False 60 | if enable_merge_weight: 61 | # 合并lora 权重 保存 62 | pl_model.save_sft_weight(os.path.join(lora_weight_dir, 'pytorch_model_merge.bin'), merge_lora_weight=True) 63 | else: 64 | model = pl_model.get_llm_model() 65 | model.eval().half().cuda() 66 | 67 | input_list = [ 68 | "写一个诗歌,关于冬天", 69 | "晚上睡不着怎么办", 70 | "中国的首都是哪里", 71 | ] 72 | prefix = '' 73 | for text in input_list: 74 | output = generate_text(model, text,prefix=prefix) 75 | print('input', text) 76 | print('output', output) -------------------------------------------------------------------------------- /infer/infer_lora_finetuning_loop.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/3/19 18:15 2 | # @Author : tk 3 | # @FileName: infer 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import torch 9 | from deep_training.data_helper import ModelArguments, DataArguments 10 | from transformers import HfArgumentParser 11 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer,PetlArguments 12 | from data_utils import config_args, postprocess, NN_DataHelper 13 | 14 | 15 | def generate_text(base_model,text,device = torch.device('cuda:0'),max_length=128,prefix=''): 16 | input_text = prefix + "用户:" + text + "\n小元:" 17 | 18 | o = tokenizer.encode_plus(input_text, truncation=True, max_length=512, return_attention_mask=False,return_token_type_ids=False) 19 | input_ids= [o['input_ids']] 20 | 
input_ids = torch.tensor(input_ids, dtype=torch.int32,device=device) 21 | 22 | logits = base_model.generate(input_ids=input_ids, max_length=max_length, bos_token_id=config.decoder_start_token_id, 23 | pad_token_id=config.pad_token_id, 24 | eos_token_id=config.eos_token_id) 25 | 26 | 27 | out_text = tokenizer.decode(logits[0], skip_special_tokens=True) 28 | out_text = postprocess(out_text) 29 | return out_text 30 | 31 | if __name__ == '__main__': 32 | 33 | parser = HfArgumentParser((ModelArguments, )) 34 | (model_args,) = parser.parse_dict(config_args,allow_extra_keys=True) 35 | 36 | dataHelper = NN_DataHelper(model_args) 37 | tokenizer, config, _,_= dataHelper.load_tokenizer_and_config() 38 | 39 | # 一般根据时间排序选最新的权重文件夹 40 | weight_dir = '../scripts/best_ckpt' 41 | lora_weight_dir = os.path.join(weight_dir, "last") 42 | 43 | lora_args = PetlArguments.from_pretrained(lora_weight_dir) 44 | assert lora_args.inference_mode == True 45 | 46 | new_num_tokens = config.vocab_size 47 | if config.task_specific_params is not None and config.task_specific_params.get('vocab_size', None) is not None: 48 | config.vocab_size = config.task_specific_params['vocab_size'] 49 | 50 | pl_model = MyTransformer(config=config,model_args=model_args,lora_args=lora_args, 51 | torch_dtype=config.torch_dtype, 52 | new_num_tokens=new_num_tokens, 53 | ) 54 | 55 | 56 | # 加载lora权重 57 | pl_model.load_sft_weight(lora_weight_dir) 58 | 59 | enable_merge_weight = False 60 | if enable_merge_weight: 61 | # 合并lora 权重 保存 62 | pl_model.save_sft_weight(os.path.join(lora_weight_dir, 'pytorch_model_merge.bin'), merge_lora_weight=True) 63 | else: 64 | model = pl_model.get_llm_model() 65 | model.eval().half().cuda() 66 | 67 | prefix = input("请输入前缀:") 68 | while True: 69 | text = input("请输入文本(quit or q or exit 退出程序):") 70 | if text.lower() in ['quit','q','exit']: 71 | break 72 | 73 | output = generate_text(model, text,prefix=prefix) 74 | print('input', text) 75 | print('output', output) -------------------------------------------------------------------------------- /args.MD: -------------------------------------------------------------------------------- 1 | 2 | ## 切换训练模式配置 3 | 修改 config/main.py 4 | # 模块配置, 默认启用lora 5 | enable_deepspeed = False 6 | enable_ptv2 = False 7 | enable_lora = True 8 | load_in_bit = 0 # 4 load_in_4bit, 8 load_in_8bit other 0 9 | 10 | ## optimizer 11 | # lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused, 12 | # adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit, 13 | # paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit, 14 | # lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp 15 | 16 | ## scheduler 17 | # linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau, cosine,cosine_with_restarts,polynomial, 18 | # constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau 19 | 20 | ### 单机多卡 21 | ```text 22 | 可见的前两块卡 23 | config_args = { 24 | 'devices': 2, 25 | } 26 | 27 | # 第一块 和 第三块卡 28 | config_args = { 29 | 'devices': [0,2], 30 | } 31 | ``` 32 | 33 | ### 多机多卡训练 34 | ```text 35 | 例子 3个机器 每个机器 4个卡 36 | 修改train.py Trainer num_nodes = 3 37 | MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=0 python train.py 38 | MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=1 python train.py 39 | MASTER_ADDR=10.0.0.1 MASTER_PORT=6667 WORLD_SIZE=12 NODE_RANK=2 python train.py 40 | ``` 41 | 42 | 43 | ### 超大数据集 44 | 修改data_utils.py "data_backend": "lmdb" 45 | 46 | ## 精度训练 47 | Trainer.precision = '16' # 半精度训练 "32": 
"32-true", "16": "16-mixed", "bf16": "bf16-mixed" 48 | 49 | 50 | ### lora finetuning 51 | ```text 52 | global_args = { 53 | "load_in_8bit": False, # lora 如果显卡支持int8 可以开启 , 需安装依赖 pip install bitsandbytes 54 | "num_layers": -1, # 是否使用骨干网络的全部层数 , -1 表示全层, 否则只用只用N层 55 | "num_layers_key": "num_layers", 56 | } 57 | lora_info_args = { 58 | 'with_lora': True, # 是否启用lora模块 59 | 'r': 8, 60 | 'target_modules': ['query_key_value'], 61 | 'target_dtype': None, 62 | 'lora_alpha': 32, 63 | 'lora_dropout': 0.1, 64 | 'bias': 'none', # Bias type for Lora. Can be 'none', 'all' or 'lora_only'" 65 | 'modules_to_save' : None, # "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 66 | } 67 | ``` 68 | 69 | 70 | ### int高效训练方式 71 | lora int8 72 | ```text 73 | global_args = { 74 | "load_in_8bit": True, # lora 如果显卡支持int8 可以开启 , 需安装依赖 pip install bitsandbytes 75 | "num_layers": -1, # 是否使用骨干网络的全部层数 , -1 表示全层, 否则只用只用N层 76 | "num_layers_key": "num_layers", 77 | } 78 | lora_info_args = { 79 | 'with_lora': True, # 是否启用lora模块 80 | 'r': 8, 81 | 'target_modules': ['query_key_value'], 82 | 'target_dtype': None, 83 | 'lora_alpha': 32, 84 | 'lora_dropout': 0.1, 85 | 'bias': 'none', # Bias type for Lora. Can be 'none', 'all' or 'lora_only'" 86 | 'modules_to_save' : None, # "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 87 | } 88 | ``` 89 | 90 | 91 | -------------------------------------------------------------------------------- /infer/infer_muti_lora_finetuning.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/3/19 18:15 2 | # @Author : tk 3 | # @FileName: infer 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import torch 9 | from deep_training.data_helper import ModelArguments, DataArguments 10 | from transformers import HfArgumentParser 11 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer,PetlArguments,PetlModel 12 | from data_utils import config_args, postprocess, NN_DataHelper 13 | 14 | 15 | def generate_text(base_model,text,device = torch.device('cuda:0'),max_length=128): 16 | input_text = "用户:" + text + "\n小元:" 17 | 18 | o = tokenizer.encode_plus(input_text, truncation=True, max_length=512, return_attention_mask=False,return_token_type_ids=False) 19 | input_ids= [o['input_ids']] 20 | input_ids = torch.tensor(input_ids, dtype=torch.int32,device=device) 21 | 22 | logits = base_model.generate(input_ids=input_ids, max_length=max_length, bos_token_id=config.decoder_start_token_id, 23 | pad_token_id=config.pad_token_id, 24 | eos_token_id=config.eos_token_id) 25 | 26 | 27 | out_text = tokenizer.decode(logits[0], skip_special_tokens=True) 28 | out_text = postprocess(out_text) 29 | return out_text 30 | 31 | if __name__ == '__main__': 32 | 33 | parser = HfArgumentParser((ModelArguments, )) 34 | (model_args,) = parser.parse_dict(config_args,allow_extra_keys=True) 35 | 36 | dataHelper = NN_DataHelper(model_args) 37 | tokenizer, config, _,_= dataHelper.load_tokenizer_and_config() 38 | 39 | ckpt_dir = './best_ckpt/last' 40 | lora_args = PetlArguments.from_pretrained(ckpt_dir) 41 | assert lora_args.inference_mode == True 42 | 43 | new_num_tokens = config.vocab_size 44 | if config.task_specific_params is not None and config.task_specific_params.get('vocab_size', None) is not None: 45 | config.vocab_size = config.task_specific_params['vocab_size'] 46 | 47 | pl_model = 
MyTransformer(config=config,model_args=model_args,lora_args=lora_args, 48 | torch_dtype=config.torch_dtype, 49 | new_num_tokens=new_num_tokens, 50 | ) 51 | 52 | # 加载多个lora权重 53 | pl_model.load_sft_weight(ckpt_dir, adapter_name="default") 54 | 55 | # 加载多个lora权重 56 | # pl_model.load_sft_weight(ckpt_dir, adapter_name="yourname") 57 | 58 | # 加载多个lora权重 59 | # pl_model.load_sft_weight(ckpt_dir, adapter_name="yourname") 60 | pl_model.eval().half().cuda() 61 | 62 | # backbone model replaced PetlModel 63 | lora_model: PetlModel = pl_model.backbone 64 | 65 | text_list = ["写一个诗歌,关于冬天", 66 | "晚上睡不着应该怎么办", 67 | "从南京到上海的路线", 68 | ] 69 | 70 | # 基准模型推理 71 | with lora_model.disable_adapter(): 72 | for input in text_list: 73 | # lora_model 调用子对象方法 74 | output = generate_text(lora_model, input) 75 | print('input', input) 76 | print('output', output) 77 | 78 | lora_model.set_adapter(adapter_name='default') 79 | 80 | for input in text_list: 81 | # lora_model 调用子对象方法 82 | output = generate_text(lora_model, input) 83 | print('input', input) 84 | print('output', output) 85 | 86 | 87 | -------------------------------------------------------------------------------- /config/colossalai_strategy.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | colossalai_strategy: 5 | "ddp": 6 | name: "ddp" 7 | broadcast_buffers: True 8 | bucket_cap_mb: 25 9 | find_unused_parameters: False 10 | check_reduction: False 11 | gradient_as_bucket_view: False 12 | static_graph: False 13 | "gemini": 14 | name: "gemini" 15 | chunk_config_dict: None 16 | chunk_init_device: None 17 | placement_policy: "static" 18 | shard_param_frac: 1.0 # only for static placement 19 | offload_optim_frac: 0.0 # only for static placement 20 | offload_param_frac: 0.0 # only for static placement 21 | warmup_non_model_data_ratio: 0.8 # only for auto placement 22 | steady_cuda_cap_ratio: 0.9 # only for auto placement 23 | precision: "fp16" 24 | pin_memory: False 25 | force_outputs_fp32: False 26 | strict_ddp_mode: False 27 | search_range_m: 32 28 | hidden_dim: None 29 | min_chunk_size_m: 32 30 | memstats: None 31 | gpu_margin_mem_ratio: 0.0 32 | initial_scale: 2 ** 16 33 | min_scale: 1 34 | growth_factor: 2 35 | backoff_factor: 0.5 36 | growth_interval: 1000 37 | hysteresis: 2 38 | max_scale: 2 ** 32 39 | max_norm: 1.0 40 | norm_type: 2.0 41 | verbose: False 42 | "zero2": 43 | name: zero2 44 | stage: 2 45 | precision: "fp16" 46 | initial_scale: 2 ** 32 47 | min_scale: 1 48 | growth_factor: 2 49 | backoff_factor: 0.5 50 | growth_interval: 1000 51 | hysteresis: 2 52 | max_scale: 2 ** 32 53 | max_norm: 1.0 54 | norm_type: 2.0 55 | reduce_bucket_size_in_m: 12 56 | communication_dtype: None 57 | overlap_communication: True 58 | cpu_offload: False 59 | verbose: False 60 | 61 | "zero2_cpu": 62 | name: zero2_cpu 63 | stage: 2 64 | precision: "fp16" 65 | initial_scale: 2 ** 32 66 | min_scale: 1 67 | growth_factor: 2 68 | backoff_factor: 0.5 69 | growth_interval: 1000 70 | hysteresis: 2 71 | max_scale: 2 ** 32 72 | max_norm: 1.0 73 | norm_type: 2.0 74 | reduce_bucket_size_in_m: 12 75 | communication_dtype: None 76 | overlap_communication: True 77 | cpu_offload: True 78 | verbose: False 79 | 80 | "3d": 81 | name: "3d" 82 | tp_size: 1 83 | pp_size: 1 84 | precision: "fp16" 85 | zero_stage: 0 86 | enable_all_optimization: False 87 | enable_fused_normalization: False 88 | enable_flash_attention: False 89 | enable_jit_fused: False 90 | enable_sequence_parallelism: False 91 | enable_sequence_overlap: False 92 | 
num_microbatches: None
93 |         microbatch_size: None
94 |         initial_scale: 2 ** 16
95 |         min_scale: 1
96 |         growth_factor: 2
97 |         backoff_factor: 0.5
98 |         growth_interval: 1000
99 |         hysteresis: 2
100 |         max_scale: 2 ** 32
101 |         max_norm: 0
102 |         broadcast_buffers: True
103 |         ddp_bucket_cap_mb: 25
104 |         find_unused_parameters: False
105 |         check_reduction: False
106 |         gradient_as_bucket_view: False
107 |         static_graph: False
108 |         zero_bucket_size_in_m: 12
109 |         cpu_offload: False
110 |         communication_dtype: None
111 |         overlap_communication: True
112 |         custom_policy: None
113 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | ```text
4 | 2024-04-22 simplified
5 | 2023-10-09 support accelerator trainer
6 | 2023-10-07 support colossalai trainer
7 | 2023-09-26 support transformers trainer
8 | 2023-08-02 add muti lora infer example; manually upgrade aigc_zoo: pip install -U git+https://github.com/ssbuild/deep_training.zoo.git --force-reinstall --no-deps
9 | 2023-06-13 support resize_token_embeddings
10 | 2023-06-01 support lora deepspeed training; 0.1.9 and 0.1.10 merged
11 | 2023-05-27 add qlora transformers>=4.30
12 | 2023-05-24 upgrade lora
13 | ```
14 | 
15 | ## update information
16 | - [deep_training](https://github.com/ssbuild/deep_training)
17 | - for t5, training precision 32 is recommended
18 | 
19 | ## install
20 | - pip install -U -r requirements.txt
21 | - if installation fails, switch to the official index: pip install -i https://pypi.org/simple -U -r requirements.txt
22 | 
23 | 
24 | ## weight
25 | 
26 | - [ChatYuan-large-v1](https://huggingface.co/ClueAI/ChatYuan-large-v1)
27 | - [ChatYuan-large-v2](https://huggingface.co/ClueAI/ChatYuan-large-v2)
28 | 
29 | 
30 | ## data
31 | [open data](https://github.com/ssbuild/open_data)
32 | ```text
33 | p prefix optional
34 | q question optional
35 | a answer must
36 | 
37 | ```
38 | ```json
39 | {
40 |   "id": 0,
41 |   "p": "我是qwen训练的模型",
42 |   "paragraph": [
43 |     {
44 |       "q": "你好",
45 |       "a": "我是机器人,有什么可以帮助你的?"
46 |     },
47 |     {
48 |       "q": "从南京到上海的路线",
49 |       "a": "你好,南京到上海的路线如下:1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"
50 |     }
51 |   ]
52 | }
53 | 
54 | ```
55 | 
56 | or
57 | 
58 | ```json
59 | {
60 |   "id": 0,
61 |   "conversations": [
62 |     {
63 |       "from": "system",
64 |       "value": "我是qwen训练的模型"
65 |     },
66 |     {
67 |       "from": "user",
68 |       "value": "你好"
69 |     },
70 |     {
71 |       "from": "assistant",
72 |       "value": "我是机器人,有什么可以帮助你的?"
73 |     },
74 |     {
75 |       "from": "user",
76 |       "value": "从南京到上海的路线"
77 |     },
78 |     {
79 |       "from": "assistant",
80 |       "value": "你好,南京到上海的路线如下:1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"
81 |     }
82 |   ]
83 | }
84 | ```
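Before building the dataset, each JSONL line can be sanity-checked against the layout above (a minimal sketch; it only assumes the `q`/`a` fields shown here, where `a` may also be a list of strings as in `data/make_data_example.py`):

```python
import json

with open("data/finetune_train_examples.json", encoding="utf-8") as f:
    for lineno, line in enumerate(f, 1):
        sample = json.loads(line)
        for turn in sample.get("paragraph", []):
            assert "q" in turn and "a" in turn, f"line {lineno}: each turn needs q and a"
```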
85 | 
86 | 
87 | # Usage
88 | The sliding window is disabled by default (see the sketch after this section):
89 | ```text
90 | data_conf = {
91 |     'stride': 0,
92 |     # sliding window: increase stride when there is plenty of data, decrease it otherwise; stride <= 0 disables the window
93 | }
94 | ```
95 | 
96 | 
97 | 
98 | ## infer
99 | ```text
100 | # infer_finetuning.py        inference with the fully fine-tuned model
101 | # infer_lora_finetuning.py   inference with the lora fine-tuned model
102 | # infer_ptuning.py           inference with the p-tuning-v2 fine-tuned model
103 | python infer_finetuning.py
104 | ```
105 | 
106 | 
107 | ## training
108 | ```text
109 | # build the dataset
110 | cd scripts
111 | bash train_full.sh -m dataset
112 | or
113 | bash train_lora.sh -m dataset
114 | or
115 | bash train_ptv2.sh -m dataset
116 | 
117 | Note: num_process_worker controls multi-process dataset building; for large datasets, raise it up to the number of CPU cores
118 | dataHelper.make_dataset_with_args(data_args.train_file,mixed_data=False, shuffle=True,mode='train',num_process_worker=0)
119 | 
120 | # full-parameter training
121 | bash train_full.sh -m train
122 | 
123 | # lora adalora ia3
124 | bash train_lora.sh -m train
125 | 
126 | # ptv2
127 | bash train_ptv2.sh -m train
128 | ```
129 | 
130 | ## training arguments
131 | [training arguments](args.MD)
132 | 
133 | ## related projects
134 | 
135 | - [pytorch-task-example](https://github.com/ssbuild/pytorch-task-example)
136 | - [tf-task-example](https://github.com/ssbuild/tf-task-example)
137 | - [chatmoss_finetuning](https://github.com/ssbuild/chatmoss_finetuning)
138 | - [chatglm_finetuning](https://github.com/ssbuild/chatglm_finetuning)
139 | - [t5_finetuning](https://github.com/ssbuild/t5_finetuning)
140 | - [llm_finetuning](https://github.com/ssbuild/llm_finetuning)
141 | - [llm_rlhf](https://github.com/ssbuild/llm_rlhf)
142 | - [chatglm_rlhf](https://github.com/ssbuild/chatglm_rlhf)
143 | - [t5_rlhf](https://github.com/ssbuild/t5_rlhf)
144 | - [rwkv_finetuning](https://github.com/ssbuild/rwkv_finetuning)
145 | - [baichuan_finetuning](https://github.com/ssbuild/baichuan_finetuning)
146 | 
147 | Pure and clean code.
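To make the `stride` setting concrete, the sliding window in `data_processer.py` advances over the token sequence like this (illustrative numbers; one slot per window is reserved for the bos token):

```python
tokens = list(range(10))          # a 10-token sequence
max_seq_length, stride = 6, 4
pos = 0
while pos < len(tokens):
    print(tokens[pos: pos + max_seq_length - 1])
    pos += stride
# [0, 1, 2, 3, 4] -> [4, 5, 6, 7, 8] -> [8, 9]
```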
60 | decoder_seqlen = len(b_ids) 61 | 62 | attention_mask = [1] * seqlen 63 | decoder_attention_mask = [1] * decoder_seqlen 64 | 65 | pad_len = max_seq_length - seqlen 66 | if pad_len > 0: 67 | a_ids += [0] * pad_len 68 | attention_mask += [0] * pad_len 69 | 70 | pad_len = max_seq_length - decoder_seqlen 71 | if pad_len > 0: 72 | b_ids += [0] * pad_len 73 | decoder_attention_mask += [0] * pad_len 74 | 75 | labels = np.asarray(copy.deepcopy(b_ids[1:]) + [-100], dtype=np.int64) 76 | labels[decoder_seqlen-1:] = -100 77 | 78 | d = { 79 | 'input_ids': np.asarray(a_ids, dtype=np.int32), 80 | 'attention_mask': np.asarray(attention_mask , dtype=np.int32), 81 | 'seqlen': np.asarray(seqlen, dtype=np.int32), 82 | 'decoder_input_ids': np.asarray(b_ids, dtype=np.int32), 83 | 'decoder_attention_mask': np.asarray(decoder_attention_mask, dtype=np.int32), 84 | 'decoder_seqlen': np.asarray(decoder_seqlen, dtype=np.int32), 85 | 'labels': np.asarray(labels, dtype=np.int64) 86 | } 87 | return d 88 | @classmethod 89 | def process(cls, tokenizer: PreTrainedTokenizer, config, sup, max_seq_length, examples): 90 | ds = [] 91 | prefix, examples = examples 92 | for sid, (q, a) in enumerate(examples): 93 | a_ids = tokenizer.encode(text=build_template(q, history=examples[:sid]), add_special_tokens=False) 94 | b_ids = tokenizer.encode(text=a,add_special_tokens=False) 95 | while len(a_ids) > max_seq_length : 96 | a_ids.pop(0) 97 | while len(b_ids) > max_seq_length - 2: 98 | b_ids.pop(-1) 99 | b_ids = [config.decoder_start_token_id] + b_ids + [config.eos_token_id] 100 | ds.append(cls.final(a_ids,b_ids,max_seq_length)) 101 | return ds 102 | 103 | 104 | class TokenSlidding: 105 | @classmethod 106 | def process(cls, tokenizer: PreTrainedTokenizer,config,stride,sup, max_seq_length, examples): 107 | ds = [] 108 | prefix,examples = examples 109 | for sid, (q, a) in enumerate(examples): 110 | a_ids = tokenizer.encode(text=build_template(q, history=examples[:sid]), add_special_tokens=False) 111 | b_ids = tokenizer.encode(text=a) + [config.eos_token_id] 112 | 113 | input_ids_all = a_ids + b_ids 114 | 115 | pos = 0 116 | while pos < len(input_ids_all): 117 | input_ids = [config.bos_token_id] + input_ids_all[pos: pos + max_seq_length - 1] 118 | pos += stride 119 | 120 | ds.append({ 121 | 'input_ids': np.asarray(input_ids,dtype=np.int32), 122 | 'seqlen': np.asarray(len(input_ids),dtype=np.int32) 123 | }) 124 | return ds 125 | 126 | 127 | -------------------------------------------------------------------------------- /training/train_cl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : ssbuild 3 | # @Time : 2023/9/25 12:29 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import logging 9 | import math 10 | from contextlib import nullcontext 11 | import datasets 12 | import torch 13 | import transformers 14 | from deep_training.trainer.cl.trainer import TrainerCL 15 | from transformers import ( 16 | HfArgumentParser, 17 | default_data_collator, 18 | set_seed, 19 | ) 20 | from transformers.trainer_utils import get_last_checkpoint 21 | from transformers.utils import check_min_version, send_example_telemetry 22 | from transformers.utils.versions import require_version 23 | from data_utils import NN_DataHelper, config_args, get_deepspeed_config, global_args 24 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer, PetlArguments,LoraConfig,PromptArguments 25 | from 
deep_training.data_helper import ModelArguments, DataArguments,TrainingArgumentsCL
26 | 
27 | assert global_args["trainer_backend"] == "cl"
28 | 
29 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
30 | check_min_version("4.33.2")
31 | 
32 | 
33 | logger = logging.getLogger(__name__)
34 | 
35 | # Setup logging
36 | logging.basicConfig(
37 |     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
38 |     datefmt="%m/%d/%Y %H:%M:%S",
39 |     handlers=[logging.StreamHandler(sys.stdout)],
40 | )
41 | 
42 | def main():
43 |     world_size, local_rank, process_index = int(os.environ.get("WORLD_SIZE", 1)), int(
44 |         os.environ.get("LOCAL_RANK", 0)), int(os.environ.get("RANK", 0))
45 | 
46 | 
47 |     training_args: TrainingArgumentsCL
48 |     parser = HfArgumentParser((ModelArguments, TrainingArgumentsCL, DataArguments, PetlArguments, PromptArguments),
49 |                               conflict_handler='resolve')
50 |     model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,allow_extra_keys=True,)
51 |     lora_args = lora_args.config
52 |     prompt_args = prompt_args.config
53 | 
54 | 
55 | 
56 |     dataHelper = NN_DataHelper(model_args, training_args, data_args)
57 |     config_kwargs = {"torch_dtype": torch.float16}
58 |     if global_args['config_merge']:
59 |         config_kwargs.update(global_args['config_merge'])
60 | 
61 |     tokenizer, config, _, _ = dataHelper.load_tokenizer_and_config(config_kwargs=config_kwargs)
62 | 
63 |     # if process_index == 0:
64 |     #     dataHelper.make_dataset_all()
65 | 
66 |     is_bf16_supported = torch.cuda.is_bf16_supported()
67 |     precision = global_args["precision"]
68 |     if precision == "auto":
69 |         # precision: adjust to match your hardware
70 |         if is_bf16_supported:
71 |             precision = 'bf16'
72 |         else:
73 |             precision = '16'
74 | 
75 |     if global_args["quantization_config"] is not None and global_args["quantization_config"].load_in_8bit:
76 |         precision = "32"
77 | 
78 | 
79 |     if str(precision) == '16':
80 |         training_args.fp16 = True
81 |     elif str(precision) == 'bf16':
82 |         training_args.bf16 = True
83 |     else:
84 |         training_args.fp16 = False
85 |         training_args.bf16 = False
86 | 
87 | 
88 |     # Log on each process the small summary:
89 |     logger.warning(
90 |         f"Process rank: {training_args.local_rank}, "
91 |         + f"16-bits training: {training_args.fp16}"
92 |     )
93 |     logger.info(f"Training/evaluation parameters {training_args}")
94 | 
95 |     # Detecting last checkpoint.
96 |     last_checkpoint = None
97 |     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
98 |         last_checkpoint = get_last_checkpoint(training_args.output_dir)
99 |         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
100 |             raise ValueError(
101 |                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
102 |                 "Use --overwrite_output_dir to overcome."
103 |             )
104 |         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
105 |             logger.info(
106 |                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
107 |                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
108 |             )
109 | 
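    # transformers.set_seed seeds python's `random`, numpy and torch
    # (including CUDA), which keeps runs reproducible across backends.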
110 |     # Set seed before initializing model.
111 |     set_seed(training_args.seed)
112 | 
113 | 
114 |     transformer_args = dict(config=config, model_args=model_args, training_args=training_args, lora_args=lora_args,
115 |                             prompt_args=prompt_args,
116 |                             quantization_config=global_args["quantization_config"],
117 |                             device_map={"": local_rank} if world_size > 1 else "auto",
118 |                             torch_dtype=torch.float16,
119 |                             new_num_tokens=len(tokenizer),  # the vocab may have been extended
120 |                             )
121 | 
122 |     if transformer_args["quantization_config"] is None:
123 |         transformer_args.pop("device_map")
124 | 
125 |     with nullcontext():
126 |         pl_model = MyTransformer(**transformer_args)
127 | 
128 |     config.save_pretrained(training_args.output_dir)
129 | 
130 |     # load SFT weights
131 |     # pl_model.load_sft_weight('./best_ckpt/best.pt',is_trainable=True)
132 | 
133 |     pl_model = pl_model.float()
134 | 
135 |     train_datasets = None
136 |     if training_args.do_train:
137 |         train_datasets = dataHelper.load_distributed_random_sampler(
138 |             dataHelper.load_dataset_files()["train_files"],
139 |             with_load_memory=data_args.data_backend == 'record',
140 |             collate_fn=dataHelper.collate_fn,
141 |             batch_size=training_args.per_device_train_batch_size,
142 |             drop_last=training_args.dataloader_drop_last,  # recommended when training on multiple GPUs
143 |             num_processes=world_size, process_index=process_index,
144 |             num_workers = training_args.dataloader_num_workers,
145 |             pin_memory = training_args.dataloader_pin_memory,
146 |         )
147 | 
148 | 
149 | 
150 |     # Initialize our Trainer
151 |     trainer = TrainerCL(
152 |         model=pl_model,
153 |         args=training_args,
154 |         train_dataset=train_datasets,
155 |         tokenizer=tokenizer,
156 |         # Data collator will default to DataCollatorWithPadding, so we change it.
157 |         data_collator=default_data_collator,
158 |     )
159 | 
160 |     # Training
161 |     if training_args.do_train:
162 |         checkpoint = None
163 |         if training_args.resume_from_checkpoint is not None:
164 |             checkpoint = training_args.resume_from_checkpoint
165 |         elif last_checkpoint is not None:
166 |             checkpoint = last_checkpoint
167 |         trainer.train(resume_from_checkpoint=checkpoint)
168 | 
169 | 
170 | 
171 | 
172 | def _mp_fn(index):
173 |     # For xla_spawn (TPUs)
174 |     main()
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     main()
179 | 
-------------------------------------------------------------------------------- /training/train_pl.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #reference: https://github.com/clue-ai/PromptCLUE/blob/main/Fine_tuning_PyTorch.ipynb
3 | import sys
4 | import os
5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..')))
6 | 
7 | import torch
8 | from deep_training.data_helper import ModelArguments, DataArguments, TrainingArguments
9 | from deep_training.trainer.pl.modelcheckpoint import ModelCheckpointEx
10 | from lightning import Trainer
11 | from lightning.pytorch.strategies import DeepSpeedStrategy
12 | from transformers import HfArgumentParser
13 | from data_utils import NN_DataHelper, config_args,get_deepspeed_config, global_args
14 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer, PetlArguments,LoraConfig,PromptArguments
15 | 
16 | 
17 | assert global_args["trainer_backend"] == "pl"
18 | 
19 | def main():
20 |     parser = HfArgumentParser((ModelArguments, TrainingArguments, DataArguments, PetlArguments, PromptArguments))
21 |     model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args)
22 |     lora_args = lora_args.config
23 |     prompt_args = prompt_args.config
24 | 
25 |     output_weight_dir = './best_ckpt'
26 | 
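    # The model config is loaded with torch_dtype=float16 by default; any
    # entries under global_args['config_merge'] in the yaml are merged on
    # top as overrides.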
27 |     config_kwargs = {"torch_dtype": torch.float16}
28 |     if global_args['config_merge']:
29 |         config_kwargs.update(global_args['config_merge'])
30 |     dataHelper = NN_DataHelper(model_args, training_args, data_args)
31 |     tokenizer, config, label2id, id2label = dataHelper.load_tokenizer_and_config(config_kwargs=config_kwargs)
32 | 
33 |     # dataHelper.make_dataset_all()
34 | 
35 |     is_bf16_supported = torch.cuda.is_bf16_supported()
36 | 
37 |     precision = global_args["precision"]
38 |     if precision == "auto":
39 |         # precision: adjust to match your hardware
40 |         if is_bf16_supported:
41 |             precision = 'bf16'
42 |         else:
43 |             precision = '16'
44 | 
45 |     if global_args["quantization_config"] is not None and global_args["quantization_config"].load_in_8bit:
46 |         precision = "32"
47 | 
48 |     deepspeed_config = get_deepspeed_config(precision)
49 |     strategy = 'ddp' if torch.cuda.device_count() > 1 else 'auto'
50 |     if deepspeed_config is not None and len(deepspeed_config):
51 |         strategy = DeepSpeedStrategy(config=deepspeed_config, )
52 | 
53 |     checkpoint_callback = ModelCheckpointEx(
54 |         # monitor='loss',
55 |         dirpath=output_weight_dir,
56 |         save_weights_only=True,
57 |         save_last=True,
58 |         # every_n_train_steps=2000 // training_args.gradient_accumulation_steps,
59 |         every_n_epochs=1,
60 |         lora_args=lora_args,
61 |         prompt_args=prompt_args,
62 |         # monitor="loss", mode="min", save_top_k=10  would keep the 10 best checkpoints by loss
63 |         monitor="step", mode="max",
64 |         save_top_k=10,  # keep the last 10 checkpoints by step
65 |     )
66 | 
67 | 
68 | 
69 |     trainer = Trainer(
70 |         callbacks=[checkpoint_callback],
71 |         max_epochs=training_args.max_epochs,
72 |         max_steps=training_args.max_steps,
73 |         accelerator="gpu",
74 |         devices=data_args.devices,
75 |         enable_progress_bar=True,
76 |         default_root_dir=data_args.output_dir,
77 |         gradient_clip_val=training_args.max_grad_norm,
78 |         accumulate_grad_batches=training_args.gradient_accumulation_steps,
79 |         num_sanity_val_steps=0,
80 |         strategy=strategy,
81 |         precision=precision  # you may also try "32": "32-true", "16": "16-mixed", "bf16": "bf16-mixed"
82 |         # precision='16-mixed',  # mixed-precision training
83 |     )
84 | 
85 |     transformer_args = dict(config=config, model_args=model_args, training_args=training_args, lora_args=lora_args, prompt_args=prompt_args,
86 |                             quantization_config=global_args.get('quantization_config', None),
87 |                             device_map={"": trainer.local_rank} if trainer.world_size > 1 else "auto",
88 |                             torch_dtype=torch.float16,
89 |                             new_num_tokens=len(tokenizer),  # in case the vocab was extended
90 |                             )
91 | 
92 |     if transformer_args["quantization_config"] is None:
93 |         transformer_args.pop("device_map")
94 | 
95 |     pl_model = MyTransformer(**transformer_args)
96 | 
97 |     config.save_pretrained(output_weight_dir)
98 | 
99 |     # load SFT weights
100 |     # pl_model.load_sft_weight('./best_ckpt/best.pt',is_trainable=True)
101 | 
102 |     pl_model = pl_model.float()
103 | 
104 |     def dataset_loader_filter_fn(dataset):
105 |         print('*' * 30, 'total', len(dataset))
106 |         return dataset
107 | 
108 | 
109 |     train_datasets = dataHelper.load_distributed_random_sampler(
110 |         dataHelper.load_dataset_files()["train_files"],
111 |         with_load_memory=data_args.data_backend == 'record',
112 |         collate_fn=dataHelper.collate_fn,
113 |         batch_size=training_args.train_batch_size,
114 |         drop_last=True,  # recommended when training on multiple GPUs
115 |         num_processes=trainer.world_size, process_index=trainer.global_rank,
116 |         dataset_loader_filter_fn=dataset_loader_filter_fn,
117 |         num_workers=0,  # num_workers for the DataLoader
118 |     )
119 | 
120 |     if train_datasets is not None:
121 |         trainer.fit(pl_model, train_dataloaders=train_datasets)
122 | 
123 | 
124 | def _mp_fn(index):
125 |     # For xla_spawn
(TPUs) 126 | main() 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | 132 | 133 | # if data_args.convert_onnx: 134 | # # 加载权重 135 | # if not lora_args.with_lora: 136 | # pl_module = MyTransformer.load_from_checkpoint('./best.pt', 137 | # lora_args=lora_args, 138 | # config=config, 139 | # model_args=model_args, 140 | # training_args=training_args) 141 | # model_ = pl_module.get_llm_model() 142 | # #保存权重, 可选上传至huggingface 143 | # tokenizer: T5Tokenizer 144 | # config: T5Config 145 | # tokenizer.save_pretrained('chatyuan_finetuning') 146 | # config.save_pretrained('chatyuan_finetuning') 147 | # model_.save_pretrained('chatyuan_finetuning', push_to_hub = False,max_shard_size= "10GB") 148 | # 149 | # # #转换onnx 模型 150 | # # input_sample = ( 151 | # # ("input_ids", torch.ones(size=(1, 128), dtype=torch.int32)), 152 | # # ("attention_mask", torch.ones(size=(1, 128), dtype=torch.int32)), 153 | # # ("decoder_input_ids", torch.ones(size=(1, 128), dtype=torch.int32)), 154 | # # ("decoder_attention_mask", torch.ones(size=(1, 128), dtype=torch.int32)), 155 | # # ) 156 | # # input_names = ("input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask") 157 | # # output_names = ("pred_ids",) 158 | # # dynamic_axes = None or {"input_ids": [0, 1], "attention_mask": [0, 1], 159 | # # "decoder_input_ids": [0, 1], "decoder_attention_mask": [0, 1], 160 | # # "pred_ids": [0, 1]} 161 | # # pl_module.convert_to_onnx('./best.onnx', 162 | # # input_sample=input_sample, 163 | # # input_names=input_names, 164 | # # output_names=output_names, 165 | # # dynamic_axes=dynamic_axes) 166 | # -------------------------------------------------------------------------------- /training/train_ac.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : ssbuild 3 | # @Time : 2023/9/25 12:29 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))) 7 | 8 | import logging 9 | import math 10 | import datasets 11 | import torch 12 | import transformers 13 | from deep_training.trainer.ac.trainer import TrainerAC 14 | from transformers import ( 15 | HfArgumentParser, 16 | default_data_collator, 17 | set_seed, 18 | ) 19 | from transformers.trainer_utils import get_last_checkpoint 20 | from transformers.utils import check_min_version, send_example_telemetry 21 | from transformers.utils.versions import require_version 22 | from data_utils import NN_DataHelper, config_args, get_deepspeed_config, global_args 23 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer, PetlArguments,LoraConfig,PromptArguments 24 | from deep_training.data_helper import ModelArguments, DataArguments,TrainingArgumentsAC 25 | 26 | assert global_args["trainer_backend"] == "ac" 27 | 28 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
29 | check_min_version("4.33.2") 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | # Setup logging 34 | logging.basicConfig( 35 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 36 | datefmt="%m/%d/%Y %H:%M:%S", 37 | handlers=[logging.StreamHandler(sys.stdout)], 38 | ) 39 | 40 | def main(): 41 | training_args: TrainingArgumentsAC 42 | parser = HfArgumentParser((ModelArguments, TrainingArgumentsAC, DataArguments, PetlArguments, PromptArguments), 43 | conflict_handler='resolve') 44 | model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,allow_extra_keys=True,) 45 | lora_args = lora_args.config 46 | prompt_args = prompt_args.config 47 | 48 | if training_args.should_log: 49 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 50 | transformers.utils.logging.set_verbosity_info() 51 | 52 | log_level = training_args.get_process_log_level() 53 | logger.setLevel(log_level) 54 | datasets.utils.logging.set_verbosity(log_level) 55 | transformers.utils.logging.set_verbosity(log_level) 56 | transformers.utils.logging.enable_default_handler() 57 | transformers.utils.logging.enable_explicit_format() 58 | 59 | dataHelper = NN_DataHelper(model_args, training_args, data_args) 60 | config_kwargs = {"torch_dtype": torch.float16} 61 | if global_args['config_merge']: 62 | config_kwargs.update(global_args['config_merge']) 63 | 64 | tokenizer, config, _, _ = dataHelper.load_tokenizer_and_config(config_kwargs=config_kwargs) 65 | 66 | # with training_args.main_process_first(desc="make_dataset_all"): 67 | # dataHelper.make_dataset_all() 68 | 69 | is_bf16_supported = torch.cuda.is_bf16_supported() 70 | precision = global_args["precision"] 71 | if precision == "auto": 72 | # 精度 根据实际情况做调整 73 | if is_bf16_supported: 74 | precision = 'bf16' 75 | else: 76 | precision = '16' 77 | 78 | if global_args["quantization_config"] is not None and global_args["quantization_config"].load_in_8bit: 79 | precision = "32" 80 | 81 | 82 | if str(precision) == '16': 83 | training_args.fp16 = True 84 | elif str(precision) == 'bf16': 85 | training_args.bf16 = True 86 | else: 87 | training_args.fp16 = False 88 | training_args.bf16 = False 89 | 90 | deepspeed_config = get_deepspeed_config(precision) 91 | if deepspeed_config: 92 | training_args.deepspeed = deepspeed_config 93 | 94 | # Log on each process the small summary: 95 | logger.warning( 96 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 97 | + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" 98 | ) 99 | logger.info(f"Training/evaluation parameters {training_args}") 100 | 101 | # Detecting last checkpoint. 102 | last_checkpoint = None 103 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 104 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 105 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 106 | raise ValueError( 107 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 108 | "Use --overwrite_output_dir to overcome." 109 | ) 110 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 111 | logger.info( 112 | f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " 113 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 114 | ) 115 | 116 | # Set seed before initializing model. 117 | set_seed(training_args.seed) 118 | 119 | world_size,local_rank,process_index = training_args.world_size,training_args.local_rank,training_args.process_index 120 | 121 | transformer_args = dict(config=config, model_args=model_args, training_args=training_args, lora_args=lora_args, 122 | prompt_args=prompt_args, 123 | quantization_config=global_args["quantization_config"], 124 | device_map={"": local_rank} if world_size > 1 else "auto", 125 | torch_dtype=torch.float16, 126 | new_num_tokens=len(tokenizer), # 可能扩充词 127 | ) 128 | 129 | if transformer_args["quantization_config"] is None: 130 | transformer_args.pop("device_map") 131 | 132 | pl_model = MyTransformer(**transformer_args) 133 | 134 | config.save_pretrained(training_args.output_dir) 135 | 136 | # 加载sft权重 137 | # pl_model.load_sft_weight('./best_ckpt/best.pt',is_trainable=True) 138 | 139 | pl_model = pl_model.float() 140 | 141 | train_datasets = None 142 | if training_args.do_train: 143 | train_datasets = dataHelper.load_distributed_random_sampler( 144 | dataHelper.load_dataset_files()["train_files"], 145 | with_load_memory=data_args.data_backend == 'record', 146 | collate_fn=dataHelper.collate_fn, 147 | batch_size=training_args.per_device_train_batch_size, 148 | drop_last=training_args.dataloader_drop_last, # 多卡建议扔掉 149 | num_processes=world_size, process_index=process_index, 150 | num_workers = training_args.dataloader_num_workers, 151 | pin_memory = training_args.dataloader_pin_memory, 152 | ) 153 | 154 | 155 | 156 | # Initialize our Trainer 157 | trainer = TrainerAC( 158 | model=pl_model, 159 | args=training_args, 160 | train_dataset=train_datasets, 161 | tokenizer=tokenizer, 162 | # Data collator will default to DataCollatorWithPadding, so we change it. 
163 | data_collator=default_data_collator, 164 | ) 165 | 166 | # Training 167 | if training_args.do_train: 168 | checkpoint = None 169 | if training_args.resume_from_checkpoint is not None: 170 | checkpoint = training_args.resume_from_checkpoint 171 | elif last_checkpoint is not None: 172 | checkpoint = last_checkpoint 173 | trainer.train(resume_from_checkpoint=checkpoint) 174 | 175 | 176 | 177 | 178 | def _mp_fn(index): 179 | # For xla_spawn (TPUs) 180 | main() 181 | 182 | 183 | if __name__ == "__main__": 184 | main() 185 | -------------------------------------------------------------------------------- /config/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : ssbuild 3 | # @Time : 2023/5/31 14:43 4 | import json 5 | import os 6 | import torch 7 | import yaml 8 | from transformers import BitsAndBytesConfig 9 | from transformers.utils import strtobool 10 | from deep_training.zoo.constants.define import (TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, 11 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, 12 | TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, 13 | TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING) 14 | 15 | # 按需修改 16 | # TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING 17 | # TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 18 | # TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING 19 | # TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING 20 | 21 | from deep_training.utils.wrapper import load_yaml 22 | 23 | 24 | 25 | # 加载 26 | __CUR_PATH__ = os.path.abspath(os.path.dirname(__file__)) 27 | 28 | 29 | config_args = load_yaml(os.environ.get('train_file', os.path.join(__CUR_PATH__, 'train_pl.yaml'))) 30 | global_args = config_args.pop("global_args") 31 | global_models_mapper = config_args.pop("global_models_mapper") 32 | colossalai_strategy = config_args.pop("colossalai_strategy", {}) 33 | train_model_config = global_models_mapper[global_args["model_name"]] 34 | 35 | 36 | 37 | def merge_from_env(global_args): 38 | merge_config = {} 39 | if "trainer_backend" in os.environ: 40 | merge_config["trainer_backend"] = str(os.environ["trainer_backend"]) 41 | if "enable_deepspeed" in os.environ: 42 | merge_config["enable_deepspeed"] = strtobool(os.environ["enable_deepspeed"]) 43 | if "enable_ptv2" in os.environ: 44 | merge_config["enable_ptv2"] = strtobool(os.environ["enable_ptv2"]) 45 | if "enable_lora" in os.environ: 46 | merge_config["enable_lora"] = strtobool(os.environ["enable_lora"]) 47 | if "load_in_bit" in os.environ: 48 | merge_config["load_in_bit"] = int(os.environ["load_in_bit"]) 49 | 50 | if "precision" in os.environ: 51 | merge_config[ "precision" ] = int(os.environ[ "precision" ]) 52 | 53 | if merge_config: 54 | global_args.update(merge_config) 55 | 56 | merge_from_env(global_args) 57 | 58 | 59 | def patch_args(config_args): 60 | assert global_args[ "trainer_backend" ] in [ "pl", "hf", "cl", "ac" ] 61 | 62 | # ensure str 63 | global_args[ "precision" ] = str(global_args[ "precision" ]) 64 | 65 | if global_args[ "quantization_config" ]: 66 | # 精度 67 | if global_args[ "precision" ] == "auto": 68 | global_args[ "quantization_config" ][ 69 | "bnb_4bit_compute_dtype" ] = "bfloat16" if torch.cuda.is_bf16_supported() else "float16" 70 | 71 | global_args[ "quantization_config" ] = BitsAndBytesConfig(**global_args[ "quantization_config" ]) 72 | 73 | assert global_args["enable_lora"] + global_args["enable_ptv2"] <= 1 , ValueError("lora ptv2 cannot open at same time") 74 | 75 | 
# update with the selected model's config
76 |     config_args.update(train_model_config)
77 | 
78 |     if global_args["trainer_backend"] == "cl":
79 |         config_args["strategy"] = colossalai_strategy[config_args["strategy"]]
80 | 
81 |     if global_args['quantization_config'] is not None:
82 |         global_args['quantization_config'].load_in_4bit = global_args["load_in_bit"] == 4
83 |         global_args['quantization_config'].load_in_8bit = global_args["load_in_bit"] == 8
84 |         if global_args["load_in_bit"] == 0:
85 |             global_args["quantization_config"] = None
86 | 
87 | 
88 | 
89 |     if global_args["enable_lora"]:
90 |         # exactly one of lora / adalora / ia3 may be enabled
91 |         assert config_args.get('lora', {}).get('with_lora', False) + \
92 |                config_args.get('adalora', {}).get('with_lora', False) + \
93 |                config_args.get('ia3', {}).get('with_lora', False) == 1, ValueError(
94 |             'only one of lora / adalora / ia3 can be enabled at a time !')
95 | 
96 |         model_type = train_model_config['model_type']
97 |         if config_args.get('lora', {}).get('with_lora', False):
98 |             config_args["lora"]["target_modules"] = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_type]
99 |         elif config_args.get('adalora', {}).get('with_lora', False):
100 |             config_args["adalora"]["target_modules"] = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[model_type]
101 |         else:
102 |             config_args["ia3"]["target_modules"] = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_type]
103 |             config_args["ia3"]["feedforward_modules"] = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[model_type]
104 | 
105 |         config_args.pop('prompt', None)
106 | 
107 |     elif global_args["enable_ptv2"]:
108 |         config_args.pop('lora', None)
109 |         config_args.pop('adalora', None)
110 |         config_args.pop('ia3', None)
111 |         if "gradient_checkpointing" in config_args:
112 |             config_args["gradient_checkpointing"] = False
113 | 
114 |         assert "prompt" in config_args
115 |         config_args["prompt"]["with_prompt"] = True
116 |     else:
117 |         config_args.pop('lora', None)
118 |         config_args.pop('adalora', None)
119 |         config_args.pop('ia3', None)
120 |         config_args.pop('prompt', None)
121 | 
122 |     # preprocessing
123 |     if 'rwkv' in (config_args['tokenizer_name'] or config_args['model_name_or_path']).lower():
124 |         config_args['use_fast_tokenizer'] = True
125 | 
126 | 
127 | 
128 | patch_args(config_args)
129 | 
130 | def get_deepspeed_config(precision='fp16'):
131 |     '''
132 |     lora / prompt finetuning -> deepspeed_offload.json
133 |     full finetuning          -> deepspeed.json
134 |     '''
135 |     # deepspeed enabled?
136 |     if not global_args["enable_deepspeed"]:
137 |         return None
138 |     precision = str(precision).lower()
139 |     # choose the deepspeed config file
140 |     is_need_update_config = False
141 |     if global_args["enable_lora"] or global_args["enable_ptv2"]:
142 |         is_need_update_config = True
143 |         filename = os.path.join(os.path.dirname(__file__), 'deepspeed_offload.json')
144 |     else:
145 |         filename = os.path.join(os.path.dirname(__file__), 'deepspeed.json')
146 | 
147 | 
148 |     with open(filename, mode='r', encoding='utf-8') as f:
149 |         deepspeed_config = json.loads(f.read())
150 | 
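    # Rationale: when the deepspeed config defines its own "optimizer" section
    # (the offload config does), deepspeed instantiates that optimizer itself,
    # so its lr/betas/eps are kept in sync with the trainer arguments below;
    # otherwise the values hard-coded in the offload config would silently win.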
151 |     # lora/ptv2 offload: keep the deepspeed optimizer in sync with the trainer hyperparameters
152 |     if is_need_update_config:
153 |         optimizer = deepspeed_config.get('optimizer', None)
154 |         if optimizer:
155 |             if global_args["trainer_backend"] == 'hf':
156 |                 optimizer['params']['betas'] = (config_args.get('adam_beta1', 0.9), config_args.get('adam_beta2', 0.999),)
157 |                 optimizer['params']['lr'] = config_args.get('learning_rate', 2e-5)
158 |                 optimizer['params']['eps'] = config_args.get('adam_epsilon', 1e-8)
159 |                 # effective only for the deepspeed_offload optimizer
160 |                 config_args['optim'] = optimizer['type']
161 |             else:
162 |                 optimizer['params']['betas'] = config_args.get('optimizer_betas', (0.9, 0.999))
163 |                 optimizer['params']['lr'] = config_args.get('learning_rate', 2e-5)
164 |                 optimizer['params']['eps'] = config_args.get('adam_epsilon', 1e-8)
165 |                 # effective only for the deepspeed_offload optimizer
166 |                 config_args['optimizer'] = optimizer['type']
167 | 
168 |     if precision == 'bf16':
169 |         if 'fp16' in deepspeed_config:
170 |             deepspeed_config["fp16"]["enabled"] = False
171 |         if 'bf16' in deepspeed_config:
172 |             deepspeed_config["bf16"]["enabled"] = True
173 |         else:
174 |             deepspeed_config['bf16'] = {"enabled": True}
175 |     elif precision == 'fp16':
176 |         if 'bf16' in deepspeed_config:
177 |             deepspeed_config["bf16"]["enabled"] = False
178 | 
179 |     return deepspeed_config
180 | 
181 | 
-------------------------------------------------------------------------------- /training/train_hf.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : ssbuild
3 | # @Time : 2023/9/25 12:29
4 | import sys
5 | import os
6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),'..')))
7 | 
8 | import logging
9 | import math
10 | import datasets
11 | import torch
12 | import transformers
13 | from deep_training.trainer.hf.trainer import TrainerHF
14 | from transformers import (
15 |     HfArgumentParser,
16 |     default_data_collator,
17 |     set_seed,
18 | )
19 | from transformers.trainer_utils import get_last_checkpoint
20 | from transformers.utils import check_min_version, send_example_telemetry
21 | from transformers.utils.versions import require_version
22 | from data_utils import NN_DataHelper, config_args, get_deepspeed_config, global_args
23 | from deep_training.zoo.model_zoo.t5.llm_model import MyTransformer, PetlArguments,LoraConfig,PromptArguments
24 | from deep_training.data_helper import ModelArguments, DataArguments,TrainingArgumentsHF
25 | 
26 | assert global_args["trainer_backend"] == "hf"
27 | 
28 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
29 | check_min_version("4.33.2")
30 | 
31 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
32 | 
33 | logger = logging.getLogger(__name__)
34 | 
35 | # Setup logging
36 | logging.basicConfig(
37 |     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
38 |     datefmt="%m/%d/%Y %H:%M:%S",
39 |     handlers=[logging.StreamHandler(sys.stdout)],
40 | )
41 | 
42 | def main():
43 |     training_args: TrainingArgumentsHF
44 |     parser = HfArgumentParser((ModelArguments, TrainingArgumentsHF, DataArguments, PetlArguments, PromptArguments),
45 |                               conflict_handler='resolve')
46 |     model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,allow_extra_keys=True,)
47 |     lora_args = lora_args.config
48 |     prompt_args = prompt_args.config
49 | 
50 |     if training_args.should_log:
51 |         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
52 | transformers.utils.logging.set_verbosity_info() 53 | 54 | log_level = training_args.get_process_log_level() 55 | logger.setLevel(log_level) 56 | datasets.utils.logging.set_verbosity(log_level) 57 | transformers.utils.logging.set_verbosity(log_level) 58 | transformers.utils.logging.enable_default_handler() 59 | transformers.utils.logging.enable_explicit_format() 60 | 61 | dataHelper = NN_DataHelper(model_args, training_args, data_args) 62 | config_kwargs = {"torch_dtype": torch.float16} 63 | if global_args['config_merge']: 64 | config_kwargs.update(global_args['config_merge']) 65 | 66 | tokenizer, config, _, _ = dataHelper.load_tokenizer_and_config(config_kwargs=config_kwargs) 67 | 68 | # with training_args.main_process_first(desc="make_dataset_all"): 69 | # dataHelper.make_dataset_all() 70 | 71 | is_bf16_supported = torch.cuda.is_bf16_supported() 72 | # 精度 根据实际情况做调整 73 | precision = global_args["precision"] 74 | if precision == "auto": 75 | if is_bf16_supported: 76 | precision = 'bf16' 77 | else: 78 | precision = '16' 79 | 80 | if global_args["quantization_config"] is not None and global_args["quantization_config"].load_in_8bit: 81 | precision = "32" 82 | 83 | 84 | if str(precision) == '16': 85 | training_args.fp16 = True 86 | elif str(precision) == 'bf16': 87 | training_args.bf16 = True 88 | else: 89 | training_args.fp16 = False 90 | training_args.bf16 = False 91 | 92 | deepspeed_config = get_deepspeed_config(precision) 93 | if deepspeed_config: 94 | training_args.deepspeed = deepspeed_config 95 | 96 | # Log on each process the small summary: 97 | logger.warning( 98 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 99 | + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" 100 | ) 101 | logger.info(f"Training/evaluation parameters {training_args}") 102 | 103 | # Detecting last checkpoint. 104 | last_checkpoint = None 105 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 106 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 107 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 108 | raise ValueError( 109 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 110 | "Use --overwrite_output_dir to overcome." 111 | ) 112 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 113 | logger.info( 114 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 115 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 116 | ) 117 | 118 | # Set seed before initializing model. 
119 | set_seed(training_args.seed) 120 | 121 | world_size,local_rank,process_index = training_args.world_size,training_args.local_rank,training_args.process_index 122 | 123 | transformer_args = dict(config=config, model_args=model_args, training_args=training_args, lora_args=lora_args, 124 | prompt_args=prompt_args, 125 | quantization_config=global_args["quantization_config"], 126 | device_map={"": local_rank} if world_size > 1 else "auto", 127 | torch_dtype=torch.float16, 128 | new_num_tokens=len(tokenizer), # 可能扩充词 129 | ) 130 | 131 | if transformer_args["quantization_config"] is None: 132 | transformer_args.pop("device_map") 133 | 134 | pl_model = MyTransformer(**transformer_args) 135 | 136 | config.save_pretrained(training_args.output_dir) 137 | 138 | # 加载sft权重 139 | # pl_model.load_sft_weight('./best_ckpt/best.pt',is_trainable=True) 140 | 141 | pl_model = pl_model.float() 142 | 143 | train_datasets = None 144 | if training_args.do_train: 145 | train_datasets = dataHelper.load_distributed_random_sampler( 146 | dataHelper.load_dataset_files()["train_files"], 147 | with_load_memory=data_args.data_backend == 'record', 148 | collate_fn=dataHelper.collate_fn, 149 | batch_size=training_args.per_device_train_batch_size, 150 | drop_last=training_args.dataloader_drop_last, # 多卡建议扔掉 151 | num_processes=world_size, process_index=process_index, 152 | num_workers = training_args.dataloader_num_workers, 153 | pin_memory = training_args.dataloader_pin_memory, 154 | ) 155 | 156 | 157 | 158 | # Initialize our Trainer 159 | trainer = TrainerHF( 160 | model=pl_model, 161 | args=training_args, 162 | train_dataset=train_datasets, 163 | tokenizer=tokenizer, 164 | # Data collator will default to DataCollatorWithPadding, so we change it. 165 | data_collator=default_data_collator, 166 | ) 167 | 168 | # Training 169 | if training_args.do_train: 170 | checkpoint = None 171 | if training_args.resume_from_checkpoint is not None: 172 | checkpoint = training_args.resume_from_checkpoint 173 | elif last_checkpoint is not None: 174 | checkpoint = last_checkpoint 175 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 176 | trainer.save_model() # Saves the tokenizer too for easy upload 177 | 178 | metrics = train_result.metrics 179 | metrics["train_samples"] = len(train_datasets) 180 | trainer.log_metrics("train", metrics) 181 | trainer.save_metrics("train", metrics) 182 | trainer.save_state() 183 | 184 | 185 | 186 | 187 | def _mp_fn(index): 188 | # For xla_spawn (TPUs) 189 | main() 190 | 191 | 192 | if __name__ == "__main__": 193 | main() 194 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # @Time : 2023/1/22 16:22 2 | # @Author : tk 3 | # @FileName: data_utils.py 4 | 5 | import sys 6 | import os 7 | from functools import cache 8 | 9 | sys.path.append(os.path.abspath(os.path.dirname(__file__))) 10 | 11 | import glob 12 | import copy 13 | import json 14 | import random 15 | import typing 16 | import numpy as np 17 | import torch 18 | from deep_training.data_helper import DataHelper, ModelArguments, TrainingArguments, DataArguments, TrainingArgumentsHF, \ 19 | TrainingArgumentsCL, TrainingArgumentsAC 20 | from deep_training.zoo.model_zoo.t5.llm_model import PetlArguments,PromptArguments 21 | from fastdatasets.record import load_dataset as Loader, RECORD, WriterObject, gfile 22 | from tqdm import tqdm 23 | from transformers import T5Tokenizer, HfArgumentParser, T5Config 24 | 
| from config import *
25 | from data_processer import DataStrategy, TokenTunction, TokenSlidding
26 | 
27 | data_conf = {
28 |     'strategy': DataStrategy.tunction,  # data strategy selection
29 |     DataStrategy.tunction: {
30 |         'sup': True,  # supervised mode
31 |     },
32 | 
33 |     DataStrategy.slidding: {
34 |         'stride': int(config_args['max_seq_length'] / 3 * 2),
35 |         'sup': True,  # supervised mode
36 |     }
37 | 
38 | }
39 | 
40 | def preprocess(text):
41 |     return text.replace("\n", "\\n").replace("\t", "\\t")
42 | 
43 | def postprocess(text):
44 |     return text.replace("\\n", "\n").replace("\\t", "\t")
45 | 
46 | 
47 | class NN_DataHelper(DataHelper):
48 |     index = 1
49 |     def __init__(self, *args,**kwargs):
50 |         super(NN_DataHelper, self).__init__(*args,**kwargs)
51 | 
52 |         strategy = data_conf['strategy']
53 |         if strategy == DataStrategy.tunction:
54 |             self.collate_fn = self.collate_fn_none_stride
55 |         else:
56 |             # sliding-window mode
57 |             self.collate_fn = self.collate_fn_stride
58 | 
59 | 
60 |     def on_data_ready(self):
61 |         self.index = -1
62 | 
63 |     # tokenize one sample
64 |     def on_data_process(self, data: typing.Any, mode: str):
65 |         self.index += 1
66 | 
67 |         tokenizer: T5Tokenizer
68 |         config: T5Config
69 |         max_seq_length = self.max_seq_length_dict[mode]
70 |         tokenizer = self.tokenizer  # noqa
71 |         config = self.config  # noqa
72 |         examples = data
73 | 
74 |         strategy = data_conf['strategy']
75 |         if strategy == DataStrategy.tunction:
76 |             ds = TokenTunction.process(tokenizer, config=config, max_seq_length=max_seq_length, examples=examples,
77 |                                        **data_conf[strategy])
78 |         elif strategy == DataStrategy.slidding:
79 |             ds = TokenSlidding.process(tokenizer, config=config, max_seq_length=max_seq_length, examples=examples,
80 |                                        **data_conf[strategy])
81 | 
82 |         else:
83 |             raise ValueError('Invalid strategy', strategy)
84 |         if not ds:
85 |             return None
86 | 
87 |         if self.index < 3:
88 |             print(ds[0])
89 |         return ds
90 | 
91 | 
92 | 
93 |     def _get_paragraph(self, lines):
94 |         D = []
95 |         for line_id, line in enumerate(lines):
96 |             jd = json.loads(line)
97 |             if not jd:
98 |                 continue
99 |             paragraph = jd['paragraph']
100 |             if line_id < 10:
101 |                 print(paragraph)
102 | 
103 |             prefix = jd.get('p', '')
104 |             paragraph = [(preprocess(session['q']),
105 |                           preprocess('\n'.join(session['a'])) if isinstance(session['a'], list) else preprocess(
106 |                               session['a']))
107 |                          for session in paragraph]
108 |             sub = []
109 |             for (q, a) in paragraph:
110 |                 assert len(a), ValueError('answer cannot be empty')
111 |                 sub.append((preprocess(q), preprocess(a)))
112 |             D.append((prefix, copy.deepcopy(sub)))
113 |         return D
114 | 
115 |     def _get_messages(self, lines):
116 |         D = []
117 |         for line_id, line in enumerate(lines):
118 |             jd = json.loads(line)
119 |             if not jd:
120 |                 continue
121 |             conversations = jd['conversations']
122 |             if line_id < 10:
123 |                 print(conversations)
124 | 
125 |             paragraph = []
126 |             prefix = ''
127 |             pair = [None, None]
128 |             for m in conversations:
129 |                 if m["from"] == 'user':
130 |                     pair[0] = preprocess(m["value"])
131 |                 elif m["from"] == 'assistant':
132 |                     pair[1] = preprocess(m["value"])
133 |                 elif m["from"] == 'system':
134 |                     prefix = preprocess(m["value"])
135 |                 if pair[0] is not None and pair[1] is not None:
136 |                     paragraph.append(tuple(pair))
137 |                     pair[0], pair[1] = None, None
138 | 
139 |             sub = []
140 |             for (q, a) in paragraph:
141 |                 assert len(a), ValueError('answer cannot be empty')
142 |                 sub.append((preprocess(q), preprocess(a)))
143 |             D.append((prefix, copy.deepcopy(sub)))
144 |         return D
145 | 
146 |     # read corpus files
147 |     def on_get_corpus(self, files: typing.List, mode: str):
148 |         D = []
149 |         files
= sum([glob.glob(file) for file in files], []) 150 | for file in files: 151 | with open(file, mode='r', encoding='utf-8', newline='\n') as f: 152 | lines = f.readlines() 153 | is_new = False 154 | if len(lines) > 0: 155 | is_new = 'conversations' in json.loads(lines[0]) 156 | if is_new: 157 | D.extend(self._get_messages(lines)) 158 | else: 159 | D.extend(self._get_paragraph(lines)) 160 | return D 161 | 162 | 163 | 164 | def collate_fn_stride(self, batch): 165 | self.tokenizer: T5Tokenizer 166 | o = {} 167 | for i, b in enumerate(batch): 168 | if i == 0: 169 | for k in b: 170 | o[k] = [torch.tensor(b[k])] 171 | else: 172 | for k in b: 173 | o[k].append(torch.tensor(b[k])) 174 | for k in o: 175 | o[k] = torch.stack(o[k]) 176 | 177 | seqlens = o.pop('seqlen') 178 | max_len = torch.max(seqlens).numpy().tolist() 179 | 180 | bs = len(batch) 181 | pad_token_id = self.tokenizer.pad_token_id 182 | eos_token_id = self.tokenizer.eos_token_id 183 | decoder_start_token_id = self.config.decoder_start_token_id 184 | 185 | 186 | input_ids = torch.full((bs, max_len), pad_token_id, dtype=torch.long) 187 | attention_mask = torch.zeros(size=(bs, max_len), dtype=torch.long) 188 | decoder_input_ids = torch.full((bs, max_len), pad_token_id, dtype=torch.long) 189 | decoder_attention_mask = torch.zeros(size=(bs, max_len), dtype=torch.long) 190 | labels = torch.full((bs, max_len), -100, dtype=torch.long) 191 | 192 | a_maxlen, b_maxlen = 0, 0 193 | raw_input_ids = o.pop('input_ids') 194 | 195 | for (seqlen, ids, a_ids, a_mask, b_ids, b_mask, label) in zip(seqlens, raw_input_ids, input_ids, attention_mask, 196 | decoder_input_ids, decoder_attention_mask, 197 | labels): 198 | seqlen = seqlen.squeeze(-1).numpy().tolist() 199 | s = np.random.randint(1, seqlen - 1, dtype=np.int32).tolist() 200 | a_ids[:s] = ids[:s] 201 | a_ids[s] = eos_token_id 202 | a_mask[:s + 1] = 1 203 | 204 | if ids[0] != decoder_start_token_id: 205 | b_len = seqlen - s + 1 206 | b_ids[0] = decoder_start_token_id 207 | b_ids[1:b_len] = ids[s:seqlen] 208 | b_mask[:b_len] = 1 209 | label[:b_len- 1] = b_ids[1:b_len] 210 | else: 211 | b_len = seqlen - s 212 | b_ids[:b_len] = ids[s:seqlen] 213 | b_mask[:b_len] = 1 214 | label[:b_len - 1] = b_ids[1:b_len] 215 | 216 | a_maxlen = max(a_maxlen, s + 1) 217 | b_maxlen = max(b_maxlen, b_len) 218 | 219 | o['input_ids'] = input_ids[:, :a_maxlen].long() 220 | o['attention_mask'] = attention_mask[:, :a_maxlen].long() 221 | o['decoder_input_ids'] = decoder_input_ids[:, :b_maxlen].long() 222 | o['decoder_attention_mask'] = decoder_attention_mask[:, :b_maxlen].long() 223 | o['labels'] = labels[:, :b_maxlen].long() 224 | return o 225 | 226 | def collate_fn_none_stride(self, batch): 227 | self.tokenizer: T5Tokenizer 228 | o = {} 229 | for i, b in enumerate(batch): 230 | if i == 0: 231 | for k in b: 232 | o[k] = [torch.tensor(b[k])] 233 | else: 234 | for k in b: 235 | o[k].append(torch.tensor(b[k])) 236 | for k in o: 237 | o[k] = torch.stack(o[k]) 238 | 239 | seqlen = torch.sum(o.pop('seqlen')) 240 | decoder_seqlen = torch.sum(o.pop('decoder_seqlen')) 241 | 242 | o['input_ids'] = o['input_ids'][:,:seqlen].long() 243 | o['attention_mask'] = o['attention_mask'][:,:seqlen].long() 244 | o['decoder_input_ids'] = o['decoder_input_ids'][:,:decoder_seqlen].long() 245 | o['decoder_attention_mask'] = o['decoder_attention_mask'][:,:decoder_seqlen].long() 246 | o['labels'] = o['labels'][:,:decoder_seqlen].long() 247 | return o 248 | 249 | def make_dataset_all(self): 250 | data_args = self.data_args 251 | 252 | #schema for arrow 
parquet
253 |         schema = {
254 |             "input_ids": "int32_list",
255 |             "attention_mask": "int32_list",
256 |             "seqlen": "int32_list",
257 |             "decoder_input_ids": "int32_list",
258 |             "decoder_attention_mask": "int32_list",
259 |             "decoder_seqlen": "int32_list",
260 |             "labels": "int32_list",
261 |         }
262 |         # build and cache the dataset
263 |         if data_args.do_train:
264 |             self.make_dataset_with_args(data_args.train_file, mixed_data=False, shuffle=True, mode='train',
265 |                                         schema=schema)
266 |         if data_args.do_eval:
267 |             self.make_dataset_with_args(data_args.eval_file, mode='eval', schema=schema)
268 |         if data_args.do_test:
269 |             self.make_dataset_with_args(data_args.test_file, mode='test', schema=schema)
270 | 
271 |         # record the cache file index
272 |         with open(os.path.join(data_args.output_dir, 'intermediate_file_index.json'), mode='w', encoding='utf-8') as f:
273 |             f.write(json.dumps({
274 |                 "train_files": self.train_files,
275 |                 "eval_files": self.eval_files,
276 |                 "test_files": self.test_files,
277 |             }, ensure_ascii=False))
278 | 
279 |     # load the cached train files
280 |     @cache
281 |     def load_dataset_files(self):
282 |         data_args = self.data_args
283 |         if not data_args.convert_file:
284 |             return {
285 |                 "train_files": self.train_files,
286 |                 "eval_files": self.eval_files,
287 |                 "test_files": self.test_files,
288 |             }
289 | 
290 |         filename = os.path.join(data_args.output_dir, 'intermediate_file_index.json')
291 |         assert os.path.exists(filename), 'build your dataset first (run `python data_utils.py`)'
292 |         with open(filename, mode='r', encoding='utf-8') as f:
293 |             return json.loads(f.read())
294 | 
295 | 
296 | if __name__ == '__main__':
297 |     if global_args["trainer_backend"] == "hf":
298 |         parser = HfArgumentParser((ModelArguments, TrainingArgumentsHF, DataArguments, PetlArguments, PromptArguments),
299 |                                   conflict_handler='resolve')
300 |         model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,
301 |                                                                                          allow_extra_keys=True, )
302 |     elif global_args["trainer_backend"] == "pl":
303 |         parser = HfArgumentParser((ModelArguments, TrainingArguments, DataArguments, PetlArguments, PromptArguments))
304 |         model_args, training_args, data_args, _, _ = parser.parse_dict(config_args)
305 |     elif global_args["trainer_backend"] == "cl":
306 |         parser = HfArgumentParser((ModelArguments, TrainingArgumentsCL, DataArguments, PetlArguments, PromptArguments),
307 |                                   conflict_handler='resolve')
308 |         model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,
309 |                                                                                          allow_extra_keys=True, )
310 |     else:
311 |         parser = HfArgumentParser((ModelArguments, TrainingArgumentsAC, DataArguments, PetlArguments, PromptArguments),
312 |                                   conflict_handler='resolve')
313 |         model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(config_args,
314 |                                                                                          allow_extra_keys=True, )
315 | 
316 |     dataHelper = NN_DataHelper(model_args, training_args, data_args)
317 |     tokenizer, config, label2id, id2label = dataHelper.load_tokenizer_and_config()
318 | 
319 | 
320 | 
321 |     # build and cache the dataset
322 |     print(f'building dataset, overwrite_cache = {data_args.overwrite_cache}')
323 |     dataHelper.make_dataset_all()
324 | 
325 |     print('make dataset complete!')
326 |     print('check data !')
327 |     dataset = dataHelper.load_sequential_sampler(dataHelper.load_dataset_files()["train_files"],
328 |                                                  with_load_memory=data_args.data_backend == 'record',
329 |                                                  batch_size=1,
330 |                                                  collate_fn=dataHelper.collate_fn)
331 | 
332 |     print('total', len(dataset))
333 |     for i, d in enumerate(dataset):
334 |         print(d)
335 |         if i > 3:
336 |             break
337 | 
338 | 
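# Usage sketch (assumptions: run from the repo root; the yaml selected via the
# `train_file` environment variable decides the backend and the data paths):
#   python data_utils.py
#   train_file=config/train_hf.yaml python data_utils.py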
-------------------------------------------------------------------------------- /data/finetune_train_examples.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 2 | {"id": 2, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 3 | {"id": 3, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 4 | {"id": 4, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 5 | {"id": 5, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 6 | {"id": 6, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 7 | {"id": 7, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 8 | {"id": 8, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 9 | {"id": 9, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 10 | {"id": 10, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 11 | {"id": 11, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 12 | {"id": 12, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 13 | {"id": 13, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 14 | {"id": 14, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 15 | {"id": 15, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 
上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 16 | {"id": 16, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 17 | {"id": 17, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 18 | {"id": 18, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 19 | {"id": 19, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 20 | {"id": 20, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 21 | {"id": 21, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 22 | {"id": 22, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 23 | {"id": 23, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 24 | {"id": 24, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 25 | {"id": 25, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 26 | {"id": 26, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 27 | {"id": 27, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 28 | {"id": 28, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 29 | {"id": 29, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 上海到南京,可以换乘上海地铁2号线,从南京站换乘地铁2线,再从南京南站换乘地铁1路,然后到达上海站"]}]} 30 | {"id": 30, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]} 31 | {"id": 31, "paragraph": [{"q": "从南京到上海的路线", "a": ["你好,南京到上海的路线如下:", "1. 南京到上海,可以乘坐南京地铁1号线,在南京站乘坐轨道交通1号线。", "2. 南京到浦东机场,可以搭乘上海地铁1号,在陆家嘴站乘坐地铁1线,在浦东国际机场站乘坐机场快线,前往上海浦东国际机场。", "3. 
200 | {"id": 200, "paragraph": [{"q": "写一个诗歌,关于冬天", "a": ["冬夜寂静冷,", "云在天边飘,", "冰封白雪上, ", "寒冷像一场雪。", " ", "雪花融化成冰,", "像那雪花飘洒,", "在寒冷的冬天,", "感受春天的喜悦。", " 冬日里,", "风雪渐消,", "一片寂静,", "把快乐和温暖带回家。"]}]}
201 | --------------------------------------------------------------------------------
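A minimal sketch of how records in this JSONL file might be read, assuming one JSON object per line with the schema shown above ({"id", "paragraph": [{"q", "a": [...]}]}); the function name load_examples is illustrative and not part of this repo's API (the repo's own parsing lives in data_processer.py / data_utils.py):

# -*- coding: utf-8 -*-
# Illustrative loader for data/finetune_train_examples.json:
# one JSON record per line, each holding an "id" and a "paragraph"
# list of {"q": question, "a": [answer lines]} turns.
import json

def load_examples(path):
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            for turn in record["paragraph"]:
                question = turn["q"]
                # Answers are stored as a list of lines; join them back into one string.
                answer = "\n".join(turn["a"])
                examples.append((record["id"], question, answer))
    return examples

if __name__ == "__main__":
    for rid, q, a in load_examples("data/finetune_train_examples.json")[:2]:
        print(rid, q, a[:30], "...")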