├── source ├── model │ ├── llama2 │ │ ├── chatllms │ │ │ ├── __init__.py │ │ │ ├── server │ │ │ │ └── __init__.py │ │ │ ├── train │ │ │ │ ├── __init__.py │ │ │ │ └── training.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── stream_server.py │ │ │ │ ├── apply_lora.py │ │ │ │ └── logger_utils.py │ │ │ ├── data │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── convert_alpaca.py │ │ │ │ ├── __init__.py │ │ │ │ └── data_loader.py │ │ │ ├── evaluation │ │ │ │ └── __init__.py │ │ │ ├── __version__.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── sample_generate_callback.py │ │ │ │ ├── compute_metrics.py │ │ │ │ └── save_peft_model_callback.py │ │ │ └── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── lora_args.py │ │ │ │ ├── quant_args.py │ │ │ │ ├── model_args.py │ │ │ │ ├── infer_args.py │ │ │ │ ├── train_args.py │ │ │ │ ├── gen_args.py │ │ │ │ └── data_args.py │ │ ├── assets │ │ │ └── wechat.jpg │ │ ├── scripts │ │ │ ├── server │ │ │ │ ├── gradio_base_webserver.sh │ │ │ │ ├── run_inference.sh │ │ │ │ ├── gradio_webserver.sh │ │ │ │ ├── gradio_qlora_webserver.sh │ │ │ │ └── apply_lora_to_base_model.sh │ │ │ ├── eval.sh │ │ │ ├── run.sh │ │ │ ├── full_finetune │ │ │ │ ├── full-finetune_ds.sh │ │ │ │ └── full-finetune.sh │ │ │ ├── lora_finetune │ │ │ │ ├── lora-finetune.sh │ │ │ │ └── lora-finetune_ds.sh │ │ │ ├── test_qlora_finetune.sh │ │ │ ├── qlora_finetune │ │ │ │ ├── finetune_baichuan_7b_vicuna_zh.sh │ │ │ │ ├── finetune_llama2_7b_alpaca_zh.sh │ │ │ │ ├── finetune_llama_7b_alpaca_zh.sh │ │ │ │ └── finetune_baichuan_7b_alpaca_zh.sh │ │ │ ├── ds_config │ │ │ │ ├── default_offload_opt_param.json │ │ │ │ └── ds_config_zero3_auto.json │ │ │ └── clean_data.sh │ │ ├── data │ │ │ ├── run_test.yaml │ │ │ ├── belle_group.yaml │ │ │ ├── vicuna_zh.yaml │ │ │ ├── alpaca_zh.yaml │ │ │ ├── alpaca_zh_pcyn.yaml │ │ │ ├── vicuna_zh_pcyn.yaml │ │ │ ├── dataset_info.yaml │ │ │ └── dataset_info.py │ │ ├── examples │ │ │ ├── clean_sharegpt │ │ │ │ ├── clean_evol_instruct.py │ │ │ │ └── merge.py │ │ │ ├── vllm │ │ │ │ ├── vllm_demo.py │ │ │ │ └── apil_chient.py │ │ │ ├── finetune_llm │ │ │ │ ├── baichuan7b_demo.py │ │ │ │ └── finetune_llama_with_qlora.py │ │ │ ├── format_data │ │ │ │ ├── merge.py │ │ │ │ ├── convert_oasst1.py │ │ │ │ ├── convert_vicuna.py │ │ │ │ └── convert_alpaca.py │ │ │ └── test_convdataset.py │ │ ├── requirements.txt │ │ ├── chatbot.py │ │ ├── server │ │ │ ├── multi_chat.py │ │ │ ├── single_chat.py │ │ │ └── gradio_base_webserver.py │ │ ├── train_qlora.py │ │ ├── train.py │ │ └── cli_demo.py │ ├── flan-t5 │ │ ├── sample_ablate.sh │ │ ├── run_ft.sh │ │ ├── run_lora.sh │ │ ├── run_prefix.sh │ │ ├── utils.py │ │ └── flan_seq2seq.py │ ├── deepspeed.json │ ├── flan_t5_predict.py │ ├── gpt_predict.py │ └── llama2_predict.py └── arch │ ├── self_knowledge │ └── sk.py │ ├── passage_relevance │ └── pr.py │ └── task_decomposition │ └── td.py ├── ra-isf.png ├── evaluation.png ├── retrieval_contriever ├── requirements.txt ├── README.md ├── example_scripts │ ├── contriever.sh │ └── mcontriever.sh ├── evaluate_retrieved_passages.py ├── preprocess.py ├── src │ ├── index.py │ ├── inbatch.py │ ├── dist_utils.py │ ├── slurm.py │ ├── normalize_text.py │ ├── moco.py │ └── contriever.py └── generate_passage_embeddings.py ├── requirement.txt ├── run.sh ├── test.py ├── config.py └── contriever_config.py /source/model/llama2/chatllms/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /source/model/llama2/chatllms/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/data/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ra-isf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/ra-isf.png -------------------------------------------------------------------------------- /evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/evaluation.png -------------------------------------------------------------------------------- /retrieval_contriever/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.11.0 2 | transformers==4.18.0 3 | beir==1.0.0 4 | -------------------------------------------------------------------------------- /source/model/llama2/assets/wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/source/model/llama2/assets/wechat.jpg -------------------------------------------------------------------------------- /source/model/llama2/scripts/server/gradio_base_webserver.sh: -------------------------------------------------------------------------------- 1 | python gradio_base_webserver.py \ 2 | --model_name_or_path /home/robin/work_dir/llm/llm_pretrain_model/baichuan 3 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/__version__.py: -------------------------------------------------------------------------------- 1 | """Version information.""" 2 | 3 | # The following line *must* be the last in the module, exactly as formatted: 4 | __version__ = '0.1.0' 5 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/server/run_inference.sh: -------------------------------------------------------------------------------- 1 | # generated_chat_vicuna 2 | CUDA_VISIBLE_DEVICES=0 python single_chat.py \ 3 | --model_name_or_path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model -------------------------------------------------------------------------------- /source/model/llama2/data/run_test.yaml: -------------------------------------------------------------------------------- 1 | 100PoisonMpts: 2 | hf_hub_url: 'damo/100PoisonMpts' 3 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json 4 | 
dataset_format: alpaca 5 | multi_turn: False 6 | -------------------------------------------------------------------------------- /source/model/flan-t5/sample_ablate.sh: -------------------------------------------------------------------------------- 1 | sample_fraction=(0.025 0.05 0.1 0.25 0.5) 2 | 3 | for (( sf=0; sf<5; sf=sf+1 )) do 4 | python flan_classification.py --train_sample_fraction ${sample_fraction[$sf]} & wait 5 | done 6 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/server/gradio_webserver.sh: -------------------------------------------------------------------------------- 1 | python gradio_webserver.py \ 2 | --model_name_or_path decapoda-research/llama-7b-hf \ 3 | --lora_model_name_or_path work_dir/oasst1-llama-7b/checkpoint-414/adapter_model 4 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/eval.sh: -------------------------------------------------------------------------------- 1 | python chatllms/evaluation/evaluate_zh.py \ 2 | --model_name_or_path ~/checkpoints/baichuan7b \ 3 | --split test \ 4 | --data_path ~/prompt_data/ceval-exam \ 5 | --output_dir ./work_dir/ceval_output 6 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/server/gradio_qlora_webserver.sh: -------------------------------------------------------------------------------- 1 | python gradio_qlora_webserver.py \ 2 | --model_name_or_path decapoda-research/llama-7b-hf \ 3 | --lora_model_name_or_path ./work_dir/oasst1-llama-7b/checkpoint-831/adapter_model \ 4 | --quant_type nf4 \ 5 | --double_quant \ 6 | --bits 4 \ 7 | --fp16 8 | -------------------------------------------------------------------------------- /source/model/flan-t5/run_ft.sh: -------------------------------------------------------------------------------- 1 | epochs=(2 5 10) 2 | lora_r=(2 4 8 16) 3 | dropout=(0.1 0.2) 4 | 5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do 6 | for ((r=0; r<4; r=r+1 )) do 7 | for (( d=0; d<2; d=d+1 )) do 8 | python flan_seq2seq.py --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait 9 | done 10 | done 11 | done 12 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/server/apply_lora_to_base_model.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python chatllms/utils/apply_lora.py \ 2 | --base-model-path ~/checkpoints/baichuan7b/ \ 3 | --lora-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/checkpoint-15000 \ 4 | --target-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model -------------------------------------------------------------------------------- /source/model/flan-t5/run_lora.sh: -------------------------------------------------------------------------------- 1 | epochs=(2 5 10) 2 | lora_r=(2 4 8 16) 3 | dropout=(0.1 0.2) 4 | 5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do 6 | for ((r=0; r<4; r=r+1 )) do 7 | for (( d=0; d<2; d=d+1 )) do 8 | python flan_seq2seq.py --peft_method "lora" --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait 9 | done 10 | done 11 | done 12 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .load_pretrain_model import 
load_model_tokenizer 2 | from .mmlueval_callback import MMLUEvalCallback 3 | from .sample_generate_callback import SampleGenerateCallback 4 | from .save_peft_model_callback import SavePeftModelCallback 5 | 6 | __all__ = [ 7 | 'load_model_tokenizer', 'MMLUEvalCallback', 'SampleGenerateCallback', 8 | 'SavePeftModelCallback' 9 | ] 10 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | deepspeed==0.10.1 3 | flash-attn==2.1.1 4 | jsonlines==3.1.0 5 | nltk==3.8.1 6 | numpy==1.24.4 7 | openai==0.27.8 8 | protobuf==4.24.0 9 | safetensors==0.3.2 10 | sentence-transformers==2.2.2 11 | sentencepiece==0.1.99 12 | spacy==2.2.4 13 | tiktoken==0.5.1 14 | tokenizers==0.15.0 15 | torch==2.0.1+cu118 16 | tqdm==4.66.1 17 | transformers==4.35.2 18 | uvicorn==0.23.2 19 | vllm==0.2.1.post1 20 | -------------------------------------------------------------------------------- /source/model/flan-t5/run_prefix.sh: -------------------------------------------------------------------------------- 1 | epochs=(5 10 15 20) 2 | prefix_tokens=(10 25 50 100) 3 | prefix_projection=(0 1) 4 | 5 | for (( epoch=0; epoch<4; epoch=epoch+1 )) do 6 | for ((pt=0; pt<4; pt=pt+1 )) do 7 | for (( proj=0; proj<2; proj=proj+1 )) do 8 | python flan_seq2seq.py --prefix_tokens ${prefix_tokens[$pt]} --epochs ${epochs[$epoch]} --prefix_projection ${prefix_projection[$proj]} & wait 9 | done 10 | done 11 | done 12 | -------------------------------------------------------------------------------- /source/model/llama2/examples/clean_sharegpt/clean_evol_instruct.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from clean_sharegpt import get_clean_data, json_dump 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--in-file', type=str) 8 | parser.add_argument('--out-file', type=str) 9 | args = parser.parse_args() 10 | 11 | clean_data2 = get_clean_data(args) 12 | json_dump(clean_data2, args.out_file) 13 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # nohup sh scripts/finetune/finetune_baichuan_7b_olcc.sh > run2.log 2>&1 & 3 | # nohup sh scripts/multiturn/full-finetune_alpaca_ds.sh > run2.log 2>&1 & 4 | nohup sh scripts/qlora_finetune/multiturn_llama_finetune.sh > run_vicuna_llama_1gpu.log 2>&1 & 5 | nohup sh scripts/qlora_finetune/multiturn_baichuan_finetune.sh > run_vicuna_baichuan_1gpu.log 2>&1 & 6 | nohup sh scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh > run_zh_baichuan_1gpu.log 2>&1 & 7 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | python main.py --engine "llama2-13b" 2 | --base_model_path {base_model_path} \ 3 | --self_knowledge_model_path {self_knowledge_model_path} \ 4 | --passage_relevance_model_path {passage_relevance_model_path} \ 5 | --task_decomposition_model_path {task_decomposition_model_path} \ 6 | --data_path {data_path} \ 7 | --n_docs {Number of documents to retrieve per questions} \ 8 | --model_name_or_path {contriever_model_path} \ 9 | --passages_embedding "wikipedia_embeddings/*" \ -------------------------------------------------------------------------------- 
/source/model/llama2/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | accelerate 3 | accelerate @ git+https://github.com/huggingface/accelerate.git 4 | bitsandbytes==0.39.0 5 | datasets 6 | deepspeed 7 | einops==0.6.1 8 | evaluate>=0.4.0 9 | gradio 10 | jieba 11 | nltk>=3.8.1 12 | numpy 13 | peft 14 | peft @ git+https://github.com/huggingface/peft.git 15 | rouge-chinese 16 | rouge-score>=0.1.2 17 | sentencepiece 18 | tokenizers 19 | torch 20 | transformers>=4.28.0 21 | transformers @ git+https://github.com/huggingface/transformers.git 22 | wandb==0.15.3 23 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_args import DataArguments 2 | from .gen_args import GenerationArguments 3 | from .infer_args import ModelInferenceArguments 4 | from .lora_args import LoraArguments 5 | from .model_args import ModelArguments 6 | from .quant_args import QuantArguments 7 | from .train_args import TrainingArguments 8 | 9 | __all__ = [ 10 | 'DataArguments', 'GenerationArguments', 'ModelArguments', 11 | 'TrainingArguments', 'ModelInferenceArguments', 'LoraArguments', 12 | 'QuantArguments' 13 | ] 14 | -------------------------------------------------------------------------------- /source/model/deepspeed.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_allow_untested_optimizer": true, 4 | "fp16": { 5 | "enabled": "auto", 6 | "loss_scale": 0, 7 | "initial_scale_power": 16, 8 | "loss_scale_window": 1000, 9 | "hysteresis": 2, 10 | "min_loss_scale": 1 11 | }, 12 | "zero_optimization": { 13 | "stage": 2, 14 | "allgather_partitions": true, 15 | "allgather_bucket_size": 5e8, 16 | "overlap_comm": false, 17 | "reduce_scatter": true, 18 | "reduce_bucket_size": 5e8, 19 | "contiguous_gradients" : true 20 | } 21 | } -------------------------------------------------------------------------------- /retrieval_contriever/README.md: -------------------------------------------------------------------------------- 1 | ## Retrieval using Contriever 2 | 3 | We utilize the Retriever: [Contriever](https://github.com/facebookresearch/contriever). 
4 | 5 | ## References 6 | 7 | ```bibtex 8 | @misc{izacard2021contriever, 9 | title={Unsupervised Dense Information Retrieval with Contrastive Learning}, 10 | author={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave}, 11 | year={2021}, 12 | url = {https://arxiv.org/abs/2112.09118}, 13 | doi = {10.48550/ARXIV.2112.09118}, 14 | } 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_dataset import make_conversation_data_module 2 | from .data_loader import make_supervised_data_module 3 | from .data_utils import (extract_alpaca_prompt_dataset, 4 | extract_default_prompt_dataset, 5 | extract_random_prompt_dataset) 6 | from .sft_dataset import make_instruction_data_module 7 | 8 | __all__ = [ 9 | 'make_conversation_data_module', 'make_supervised_data_module', 10 | 'make_instruction_data_module', 'extract_random_prompt_dataset', 11 | 'extract_alpaca_prompt_dataset', 'extract_default_prompt_dataset' 12 | ] 13 | -------------------------------------------------------------------------------- /source/model/llama2/examples/vllm/vllm_demo.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prompts = [ 4 | 'Hello, my name is', 5 | 'The president of the United States is', 6 | 'The capital of France is', 7 | 'The future of AI is', 8 | ] 9 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 10 | 11 | llm = LLM(model='decapoda-research/llama-7b-hf', gpu_memory_utilization=0.9) 12 | 13 | # Print the outputs. 
14 | for i in range(10): 15 | outputs = llm.generate(prompts, sampling_params) 16 | for output in outputs: 17 | prompt = output.prompt 18 | generated_text = output.outputs[0].text 19 | print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}') 20 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/lora_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class LoraArguments: 6 | # lora中A矩阵的列数量和B矩阵的行数量 7 | lora_r: int = field(default=64, metadata={'help': 'Lora R dimension.'}) 8 | # 缩放因子 9 | lora_alpha: float = field(default=16, metadata={'help': ' Lora alpha.'}) 10 | # dropout,一种正则化方法,可以模仿集成学习 11 | lora_dropout: float = field(default=0.0, 12 | metadata={'help': 'Lora dropout.'}) 13 | # 每个GPU上可使用的显存大小,以MB为单位。默认是A100高端版本的80GB 14 | max_memory_MB: int = field(default=80000, 15 | metadata={'help': 'Free memory per gpu.'}) 16 | lora_weight_path: str = '' 17 | bias: str = 'none' 18 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/full_finetune/full-finetune_ds.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train.py \ 2 | --model_name_or_path facebook/opt-125m \ 3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \ 4 | --output_dir work_dir/alpaca_full-finetune \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy "no" \ 10 | --save_strategy "steps" \ 11 | --save_steps 500 \ 12 | --save_total_limit 5 \ 13 | --learning_rate 2e-5 \ 14 | --weight_decay 0. \ 15 | --warmup_ratio 0.03 \ 16 | --lr_scheduler_type "cosine" \ 17 | --logging_steps 1 \ 18 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json" 19 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/lora_finetune/lora-finetune.sh: -------------------------------------------------------------------------------- 1 | python train_lora.py \ 2 | --model_name_or_path facebook/opt-125m \ 3 | --dataset_name 100PoisonMpts \ 4 | --output_dir work_dir/lora-finetune \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy "no" \ 10 | --save_strategy "steps" \ 11 | --save_steps 500 \ 12 | --save_total_limit 5 \ 13 | --learning_rate 1e-4 \ 14 | --weight_decay 0. 
\ 15 | --warmup_ratio 0.03 \ 16 | --optim "adamw_torch" \ 17 | --lr_scheduler_type "cosine" \ 18 | --model_max_length 1024 \ 19 | --logging_steps 1 \ 20 | --do_train \ 21 | --do_eval \ 22 | --gradient_checkpointing True 23 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/full_finetune/full-finetune.sh: -------------------------------------------------------------------------------- 1 | python train.py \ 2 | --model_name_or_path facebook/opt-125m \ 3 | --dataset_name share_gpt \ 4 | --output_dir work_dir/full-finetune \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy "steps" \ 10 | --save_strategy "steps" \ 11 | --eval_steps 1000 \ 12 | --save_steps 1000 \ 13 | --save_total_limit 5 \ 14 | --logging_steps 1 \ 15 | --learning_rate 2e-5 \ 16 | --weight_decay 0. \ 17 | --warmup_ratio 0.03 \ 18 | --optim "adamw_torch" \ 19 | --lr_scheduler_type "cosine" \ 20 | --gradient_checkpointing True \ 21 | --model_max_length 128 \ 22 | --trust_remote_code \ 23 | --do_train \ 24 | --do_eval 25 | -------------------------------------------------------------------------------- /source/model/flan_t5_predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import json 5 | 6 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 7 | 8 | 9 | def model_init(args): 10 | model_path = args.model_path 11 | device = torch.device("cuda:0") 12 | model = AutoModelForSeq2SeqLM.from_pretrained( 13 | model_path, 14 | torch_dtype=torch.float16, 15 | ).to(device) 16 | tokenizer = AutoTokenizer.from_pretrained(model_path) 17 | return model, tokenizer, device 18 | 19 | 20 | def predict(args, prompt, model, tokenizer): 21 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda') 22 | generate_ids = model.generate(**inputs, temperature=args.temperature) 23 | generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1] 24 | infer_res = tokenizer.decode(generate_ids) 25 | return infer_res 26 | -------------------------------------------------------------------------------- /source/model/gpt_predict.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from openai import OpenAI 4 | 5 | def predict(args, prompt): 6 | my_key = args.api_key 7 | max_length = 256 8 | temperature = 0.0 9 | top_p = 1 10 | frequency_penalty = 0 11 | presence_penalty = 0 12 | client = OpenAI(api_key = my_key) 13 | prompt = " 14 | response = client.completions.create( 15 | model="gpt-3.5-turbo-instruct", # text-davinci-003 is deprecated 16 | prompt=prompt, 17 | max_tokens=max_length, 18 | temperature=temperature, 19 | top_p=top_p, 20 | frequency_penalty=frequency_penalty, 21 | presence_penalty=presence_penalty, 22 | # api_key=my_key, 23 | ) 24 | if args.engine == 'llama2-13b': 25 | raise NotImplementedError('Engine false when running gpt3.5: {}'.format(args.engine)) 26 | return response.choices[0].text -------------------------------------------------------------------------------- /source/model/llama2_predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import json 5 | from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig 6 | 7 | def model_init(model_path): 8 | # model_path = args.model_path 9 | device = 
torch.device("cuda:0") 10 | model = LlamaForCausalLM.from_pretrained( 11 | model_path, 12 | torch_dtype=torch.float16, 13 | ).to(device) 14 | tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=False) 15 | return model, tokenizer 16 | 17 | def predict(args, prompt, model, tokenizer): 18 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda') 19 | generate_ids = model.generate(**inputs, max_length=args.max_length, temperature=args.temperature) 20 | generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1] 21 | infer_res = tokenizer.decode(generate_ids) 22 | return infer_res 23 | -------------------------------------------------------------------------------- /source/model/llama2/examples/clean_sharegpt/merge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge two conversation files into one 3 | 4 | Usage: python3 -m fastchat.data.merge --in file1.json file2.json --out merged.json 5 | """ 6 | 7 | import argparse 8 | 9 | from clean_sharegpt import json_dump, json_load 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--in-file', type=str, required=True, nargs='+') 14 | parser.add_argument('--out-file', type=str, default='merged.json') 15 | args = parser.parse_args() 16 | 17 | new_content = [] 18 | for in_file in args.in_file: 19 | content = json_load(in_file) 20 | print(f'in-file: {in_file}, len: {len(content)}') 21 | new_content.extend(content) 22 | 23 | print(f'#out: {len(new_content)}') 24 | print(f'Save new_content to {args.out_file}') 25 | json_dump(new_content, args.out_file) 26 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/lora_finetune/lora-finetune_ds.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train_lora.py \ 2 | --model_name_or_path facebook/opt-125m \ 3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \ 4 | --output_dir work_dir/alpaca_full-finetune \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy "no" \ 10 | --save_strategy "steps" \ 11 | --save_steps 500 \ 12 | --save_total_limit 5 \ 13 | --learning_rate 2e-5 \ 14 | --weight_decay 0. 
\ 15 | --warmup_ratio 0.03 \ 16 | --optim "adamw_torch" \ 17 | --lr_scheduler_type "cosine" \ 18 | --model_max_length 2048 \ 19 | --logging_steps 1 \ 20 | --do_train \ 21 | --do_eval \ 22 | --gradient_checkpointing True \ 23 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json" 24 | -------------------------------------------------------------------------------- /source/model/llama2/examples/finetune_llm/baichuan7b_demo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | 4 | 5 | def main(load_in_8bit=True, model_path=''): 6 | tokenizer = AutoTokenizer.from_pretrained( 7 | pretrained_model_name_or_path=model_path, trust_remote_code=True) 8 | model = AutoModelForCausalLM.from_pretrained( 9 | pretrained_model_name_or_path=model_path, 10 | load_in_8bit=load_in_8bit, 11 | torch_dtype=torch.float16, 12 | device_map='auto', 13 | trust_remote_code=True) 14 | inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt') 15 | inputs = inputs.to('cuda:0') 16 | pred = model.generate(**inputs, max_new_tokens=64) 17 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 18 | 19 | 20 | if __name__ == '__main__': 21 | load_in_8bit = True 22 | model_path = '/home/robin/work_dir/llm/llm_pretrain_model/baichuan' 23 | main(load_in_8bit, model_path) 24 | -------------------------------------------------------------------------------- /source/model/llama2/chatbot.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import gradio as gr 3 | 4 | 5 | if __name__ == "__main__": 6 | openai.api_key = "Your API key" 7 | 8 | messages = [ 9 | {"role": "system", "content": "You are a helpful and kind AI Assistant."}, 10 | ] 11 | 12 | def chatbot(input): 13 | if input: 14 | messages.append({"role": "user", "content": input}) 15 | chat = openai.ChatCompletion.create( 16 | model="gpt-3.5-turbo", messages=messages 17 | ) 18 | reply = chat.choices[0].message.content 19 | messages.append({"role": "assistant", "content": reply}) 20 | return reply 21 | 22 | inputs = gr.inputs.Textbox(lines=7, label="Chat with AI") 23 | outputs = gr.outputs.Textbox(label="Reply") 24 | 25 | gr.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="AI Chatbot", 26 | description="Ask anything you want", 27 | theme="compact").launch(share=True) -------------------------------------------------------------------------------- /source/model/llama2/data/belle_group.yaml: -------------------------------------------------------------------------------- 1 | belle_0.5m: 2 | hf_hub_url: BelleGroup/train_0.5M_CN 3 | local_path: '' 4 | dataset_format: alpaca 5 | multi_turn: False 6 | 7 | belle_1m: 8 | hf_hub_url: BelleGroup/train_1M_CN 9 | local_path: '' 10 | dataset_format: alpaca 11 | multi_turn: False 12 | 13 | belle_2m: 14 | hf_hub_url: BelleGroup/train_2M_CN 15 | local_path: '' 16 | dataset_format: alpaca 17 | multi_turn: False 18 | 19 | belle_dialog: 20 | hf_hub_url: BelleGroup/generated_chat_0.4M 21 | local_path: '' 22 | dataset_format: belle_dialog 23 | multi_turn: False 24 | 25 | belle_math: 26 | hf_hub_url: BelleGroup/school_math_0.25M 27 | local_path: '' 28 | dataset_format: alpaca 29 | multi_turn: False 30 | 31 | belle_multiturn: 32 | hf_hub_url: BelleGroup/multi_turn_0.5M 33 | local_path: '' 34 | dataset_format: belle_multiturn 35 | multi_turn: True 36 | columns: 37 | prompt: instruction 38 | query: '' 39 | response: output 40 | history: history 41 | 
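The registry entries above (and in the other YAML files under data/) share one schema: hf_hub_url, local_path, dataset_format, multi_turn, and an optional columns mapping. The loader that consumes them (chatllms/data/data_utils.py) is not included in this listing, so the snippet below is only a minimal sketch of how a single entry could be resolved; the helper name load_registry_entry and the hub-versus-local fallback logic are assumptions, not code from this repository.

import yaml
from datasets import load_dataset


def load_registry_entry(registry_path: str, name: str):
    """Resolve one dataset entry from a registry YAML file (illustrative sketch)."""
    with open(registry_path, 'r', encoding='utf-8') as f:
        registry = yaml.safe_load(f)
    entry = registry[name]
    # Prefer the Hugging Face Hub when a hub id is given, otherwise fall back
    # to the local JSON file referenced by local_path.
    if entry.get('hf_hub_url'):
        dataset = load_dataset(entry['hf_hub_url'], split='train')
    else:
        dataset = load_dataset('json', data_files=entry['local_path'], split='train')
    return dataset, entry.get('dataset_format'), entry.get('multi_turn', False)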
--------------------------------------------------------------------------------
/source/arch/self_knowledge/sk.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import re
 4 | import string
 5 |
 6 | import os
 7 | import argparse
 8 | import csv
 9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Self_Knowledge_Model():
20 |     def __init__(self, model, tokenizer):
21 |         self.model = model
22 |         self.tokenizer = tokenizer
23 |
24 |     def find_known(self, context, query):
25 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "know":
30 |             return True
31 |         elif result == "unknow":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on SKM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
/source/arch/passage_relevance/pr.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import re
 4 | import string
 5 |
 6 | import os
 7 | import argparse
 8 | import csv
 9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Passage_Relevance_Model():
20 |     def __init__(self, model, tokenizer):
21 |         self.model = model
22 |         self.tokenizer = tokenizer
23 |
24 |     def find_relevance(self, context, query, passage):
25 |         inputs = self.tokenizer(context + query + "\nPassage: " + passage, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "relevance":
30 |             return True
31 |         elif result == "irrelevance":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on PRM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
/source/arch/task_decomposition/td.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import re
 4 | import string
 5 |
 6 | import os
 7 | import argparse
 8 | import csv
 9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Task_Decomposition_Model():
20 |     def __init__(self, model, tokenizer):
21 |         self.model = model
22 |         self.tokenizer = tokenizer
23 |         self.query_list = list()
24 |
25 |     def decompose(self, context, query):
26 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
27 |         generate_ids = self.model.generate(**inputs, max_length=512)
28 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
29 |         result = self.tokenizer.decode(generate_ids)
30 |         try:
31 |             data = json.loads(result)
32 |             for idx, q in enumerate(data['query']):
33 |                 self.query_list.append(q)
34 |         except json.JSONDecodeError:
35 |             print(f"Invalid format on TDM query: {context + query}, json_string: {result}")
36 |
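main.py, which combines the three auxiliary models defined above with the retriever and the base model, is not part of this listing. The sketch below only illustrates how Self_Knowledge_Model, Passage_Relevance_Model, and Task_Decomposition_Model could be wired into an iterative answering loop of that kind; the retrieve and generate_answer callables, the prompt prefix, and the recursion depth are hypothetical placeholders rather than code from this repository.

def answer(question, skm, prm, tdm, retrieve, generate_answer, depth=0, max_depth=2):
    """Illustrative loop over the three auxiliary models (skm, prm, tdm are
    instances of the classes defined above); retrieve and generate_answer are
    placeholder callables supplied by the caller."""
    context = "Question: "  # placeholder prompt prefix
    # 1. If the model judges the question as already known, answer directly.
    if skm.find_known(context, question):
        return generate_answer(question, passages=[])
    # 2. Otherwise retrieve passages and keep only those judged relevant.
    passages = retrieve(question)
    relevant = [p for p in passages if prm.find_relevance(context, question, p)]
    if relevant:
        return generate_answer(question, passages=relevant)
    # 3. Fall back to decomposing the question into sub-questions and recursing.
    if depth >= max_depth:
        return generate_answer(question, passages=[])
    tdm.query_list.clear()
    tdm.decompose(context, question)
    sub_questions = list(tdm.query_list)  # snapshot before recursive calls reuse tdm
    sub_answers = [
        answer(q, skm, prm, tdm, retrieve, generate_answer, depth + 1, max_depth)
        for q in sub_questions
    ]
    return generate_answer(question, passages=sub_answers)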
-------------------------------------------------------------------------------- /source/model/llama2/scripts/test_qlora_finetune.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \ 2 | --model_name_or_path facebook/opt-125m \ 3 | --dataset_name olcc \ 4 | --output_dir ./work_dir/run_test \ 5 | --num_train_epochs 3 \ 6 | --max_train_samples 100 \ 7 | --max_eval_samples 100 \ 8 | --per_device_train_batch_size 4 \ 9 | --per_device_eval_batch_size 4 \ 10 | --gradient_accumulation_steps 8 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 50 \ 13 | --save_strategy steps \ 14 | --save_total_limit 5 \ 15 | --save_steps 100 \ 16 | --logging_strategy steps \ 17 | --logging_steps 1 \ 18 | --learning_rate 0.0002 \ 19 | --warmup_ratio 0.03 \ 20 | --weight_decay 0.0 \ 21 | --lr_scheduler_type constant \ 22 | --adam_beta2 0.999 \ 23 | --max_grad_norm 0.3 \ 24 | --max_new_tokens 32 \ 25 | --lora_r 64 \ 26 | --lora_alpha 16 \ 27 | --lora_dropout 0.1 \ 28 | --double_quant \ 29 | --quant_type nf4 \ 30 | --fp16 \ 31 | --bits 4 \ 32 | --gradient_checkpointing \ 33 | --trust_remote_code \ 34 | --do_train \ 35 | --do_eval \ 36 | --sample_generate \ 37 | --data_seed 42 \ 38 | --seed 0 39 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=1 python train_qlora.py \ 2 | --model_name_or_path ~/checkpoints/baichuan7b \ 3 | --dataset_cfg ./data/vicuna_zh_pcyn.yaml \ 4 | --output_dir ./work_dir/vicuna_zh-baichuan-7b \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 2 \ 7 | --per_device_eval_batch_size 2 \ 8 | --gradient_accumulation_steps 16 \ 9 | --evaluation_strategy steps \ 10 | --eval_steps 1000 \ 11 | --save_strategy steps \ 12 | --save_total_limit 10 \ 13 | --save_steps 1000 \ 14 | --logging_strategy steps \ 15 | --logging_steps 5 \ 16 | --learning_rate 0.0002 \ 17 | --warmup_ratio 0.03 \ 18 | --weight_decay 0.0 \ 19 | --lr_scheduler_type constant \ 20 | --adam_beta2 0.999 \ 21 | --max_grad_norm 0.3 \ 22 | --lora_r 64 \ 23 | --lora_alpha 16 \ 24 | --lora_dropout 0.1 \ 25 | --double_quant \ 26 | --quant_type nf4 \ 27 | --fp16 \ 28 | --bits 4 \ 29 | --model_max_length 1024 \ 30 | --gradient_checkpointing \ 31 | --trust_remote_code True \ 32 | --use_auth_token True \ 33 | --do_train \ 34 | --do_eval \ 35 | --data_seed 42 \ 36 | --seed 0 37 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh: -------------------------------------------------------------------------------- 1 | python train_qlora.py \ 2 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ 4 | --output_dir ./work_dir/alpaca_zh_llama2-7b \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy steps \ 10 | --eval_steps 1000 \ 11 | --save_strategy steps \ 12 | --save_total_limit 10 \ 13 | --save_steps 1000 \ 14 | --logging_strategy steps \ 15 | --logging_steps 5 \ 16 | --learning_rate 0.0002 \ 17 | --warmup_ratio 0.03 \ 18 | --weight_decay 0.0 \ 19 | --lr_scheduler_type constant \ 20 | --adam_beta2 0.999 \ 21 | --max_grad_norm 0.3 \ 22 | --lora_r 64 \ 23 | --lora_alpha 16 \ 24 | 
--lora_dropout 0.1 \ 25 | --double_quant \ 26 | --quant_type nf4 \ 27 | --fp16 \ 28 | --bits 4 \ 29 | --model_max_length 1024 \ 30 | --gradient_checkpointing \ 31 | --trust_remote_code True \ 32 | --use_auth_token True \ 33 | --do_train \ 34 | --do_eval \ 35 | --sample_generate \ 36 | --data_seed 42 \ 37 | --seed 0 38 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh: -------------------------------------------------------------------------------- 1 | python train_qlora.py \ 2 | --model_name_or_path decapoda-research/llama-7b-hf \ 3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ 4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy steps \ 10 | --eval_steps 1000 \ 11 | --save_strategy steps \ 12 | --save_total_limit 10 \ 13 | --save_steps 1000 \ 14 | --logging_strategy steps \ 15 | --logging_steps 5 \ 16 | --learning_rate 0.0002 \ 17 | --warmup_ratio 0.03 \ 18 | --weight_decay 0.0 \ 19 | --lr_scheduler_type constant \ 20 | --adam_beta2 0.999 \ 21 | --max_grad_norm 0.3 \ 22 | --lora_r 64 \ 23 | --lora_alpha 16 \ 24 | --lora_dropout 0.1 \ 25 | --double_quant \ 26 | --quant_type nf4 \ 27 | --fp16 \ 28 | --bits 4 \ 29 | --model_max_length 1024 \ 30 | --gradient_checkpointing \ 31 | --trust_remote_code True \ 32 | --use_auth_token True \ 33 | --do_train \ 34 | --do_eval \ 35 | --sample_generate \ 36 | --data_seed 42 \ 37 | --seed 0 38 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/quant_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class QuantArguments: 6 | # 使用8-bit的adam,是否可以调整为LION或Sophia,甚至deepspeed还提供了多个1-bit优化器选择 7 | adam8bit: bool = field(default=False, metadata={'help': 'Use 8-bit adam.'}) 8 | # 是否使用二次量化 9 | double_quant: bool = field( 10 | default=True, 11 | metadata={ 12 | 'help': 13 | 'Compress the quantization statistics through double quantization.' 14 | }) 15 | # 量化类型,可以选择`fp4`或`nf4` 16 | quant_type: str = field( 17 | default='nf4', 18 | metadata={ 19 | 'help': 20 | 'Quantization data type to use. Should be one of `fp4` or `nf4`.' 21 | }) 22 | # 使用的位宽,默认为4。 23 | bits: int = field(default=4, metadata={'help': 'How many bits to use.'}) 24 | 25 | def __post_init__(self): 26 | if self.bits is not None: 27 | assert self.bits in [ 28 | 4, 8 29 | ], 'We only accept 4-bit or 8-bit quantization.' 30 | 31 | if self.quant_type is not None: 32 | assert self.quant_type in [ 33 | 'nf4', 'fp4' 34 | ], 'We only accept `nf4` or `fp4` quantization type.' 
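# Illustrative sketch (an assumption about downstream usage, not shown in this
# file): these fields are typically mapped onto a bitsandbytes quantization
# config when the base model is loaded. The helper name and the float16
# compute dtype below are assumptions.
import torch
from transformers import BitsAndBytesConfig


def to_bnb_config(quant_args: QuantArguments) -> BitsAndBytesConfig:
    """Build a BitsAndBytesConfig from QuantArguments (illustrative sketch)."""
    return BitsAndBytesConfig(
        load_in_4bit=quant_args.bits == 4,
        load_in_8bit=quant_args.bits == 8,
        bnb_4bit_quant_type=quant_args.quant_type,
        bnb_4bit_use_double_quant=quant_args.double_quant,
        bnb_4bit_compute_dtype=torch.float16,
    )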
35 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \ 2 | --model_name_or_path ~/checkpoints/baichuan7b \ 3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ 4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \ 5 | --num_train_epochs 3 \ 6 | --per_device_train_batch_size 4 \ 7 | --per_device_eval_batch_size 4 \ 8 | --gradient_accumulation_steps 8 \ 9 | --evaluation_strategy steps \ 10 | --eval_steps 1000 \ 11 | --save_strategy steps \ 12 | --save_total_limit 10 \ 13 | --save_steps 1000 \ 14 | --logging_strategy steps \ 15 | --logging_steps 5 \ 16 | --learning_rate 0.0002 \ 17 | --warmup_ratio 0.03 \ 18 | --weight_decay 0.0 \ 19 | --lr_scheduler_type constant \ 20 | --adam_beta2 0.999 \ 21 | --max_grad_norm 0.3 \ 22 | --lora_r 64 \ 23 | --lora_alpha 16 \ 24 | --lora_dropout 0.1 \ 25 | --double_quant \ 26 | --quant_type nf4 \ 27 | --fp16 \ 28 | --bits 4 \ 29 | --model_max_length 1024 \ 30 | --gradient_checkpointing \ 31 | --trust_remote_code True \ 32 | --use_auth_token True \ 33 | --do_train \ 34 | --do_eval \ 35 | --sample_generate \ 36 | --data_seed 42 \ 37 | --seed 0 38 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer 3 | from retrieval_contriever.src.contriever import Contriever 4 | 5 | tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/contriever-msmarco') 6 | model = Contriever.from_pretrained('/root/autodl-tmp/contriever-msmarco') 7 | 8 | sentences = [ 9 | "Where was Marie Curie born?", 10 | "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.", 11 | "Born in Paris on 15 May 1859, 111111 Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace." 12 | ] 13 | 14 | # Apply tokenizer 15 | inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 16 | 17 | # Compute token embeddings 18 | outputs = model(**inputs) 19 | 20 | # Mean pooling 21 | def mean_pooling(token_embeddings, mask): 22 | token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.) 
23 | sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None] 24 | return sentence_embeddings 25 | # embeddings = mean_pooling(outputs[0], inputs['attention_mask']) 26 | embeddings = outputs 27 | # print(embeddings[0]) 28 | # print(embeddings[1]) 29 | score1 = embeddings[0] @ embeddings[1] 30 | score2 = embeddings[0] @ embeddings[2] 31 | print(score1) 32 | print(score2) -------------------------------------------------------------------------------- /source/model/llama2/examples/format_data/merge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge two conversation files into one 3 | 4 | Usage: python3 -m fastchat.data.merge --in file1.json file2.json --out merged.json 5 | """ 6 | 7 | import argparse 8 | import json 9 | 10 | from datasets import load_dataset 11 | 12 | 13 | def json_load(in_file): 14 | with open(in_file, 'r') as f: 15 | json_data = json.load(f) 16 | return json_data 17 | 18 | 19 | def json_dump(obj, path): 20 | with open(path, 'w', encoding='utf-8') as f: 21 | json.dump(obj, f, indent=2, ensure_ascii=False) 22 | 23 | 24 | def merge_datasets(in_file_list, out_file): 25 | 26 | new_content = [] 27 | for in_file in in_file_list: 28 | content = load_dataset('json', data_files=in_file)['train'] 29 | 30 | print(f'in-file: {in_file}, len: {len(content)}') 31 | new_content.extend(content) 32 | 33 | print(f'#out: {len(new_content)}') 34 | print(f'Save new_content to {out_file}') 35 | json_dump(new_content, out_file) 36 | 37 | 38 | if __name__ == '__main__': 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('--in-file', type=str, required=True, nargs='+') 41 | parser.add_argument('--out-file', type=str, default='merged.json') 42 | args = parser.parse_args() 43 | 44 | merge_datasets(args.in_file, args.out_file) 45 | -------------------------------------------------------------------------------- /source/model/llama2/data/vicuna_zh.yaml: -------------------------------------------------------------------------------- 1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments. 
2 | coig: 3 | hf_hub_url: BAAI/COIG 4 | local_path: /home/robin/prompt_data/COIG/train_vicuna.json 5 | dataset_format: sharegpt 6 | multi_turn: True 7 | 8 | cvalues_comparison_train: 9 | hf_hub_url: '' 10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_vicuna.json 11 | dataset_format: sharegpt 12 | multi_turn: True 13 | 14 | cvalues_comparison_test: 15 | hf_hub_url: '' 16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_vicuna.json 17 | dataset_format: sharegpt 18 | multi_turn: True 19 | 20 | olcc: 21 | hf_hub_url: '' 22 | local_path: /home/robin/prompt_data/olcc/olcc_vicuna.json 23 | dataset_format: sharegpt 24 | multi_turn: True 25 | 26 | 100PoisonMpts: 27 | hf_hub_url: '' 28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_vicuna.json 29 | dataset_format: sharegpt 30 | multi_turn: True 31 | 32 | safety_prompt_part1: 33 | hf_hub_url: '' 34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json 35 | dataset_format: sharegpt 36 | multi_turn: True 37 | 38 | safety_prompt_part2: 39 | hf_hub_url: '' 40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json 41 | dataset_format: sharegpt 42 | multi_turn: True 43 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/model_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class ModelArguments: 7 | model_name_or_path: Optional[str] = field( 8 | default='facebook/opt-125m', 9 | metadata={ 10 | 'help': 11 | ("The model checkpoint for weights initialization. Don't set if you want to\ 12 | train a model from scratch.") 13 | }, 14 | ) 15 | tokenizer_name: Optional[str] = field( 16 | default=None, 17 | metadata={ 18 | 'help': 19 | 'Pretrained tokenizer name or path if not the same as model_name' 20 | }) 21 | model_revision: str = field( 22 | default='main', 23 | metadata={ 24 | 'help': 25 | 'The specific model version to use (can be a branch name, tag name or commit id).' 26 | }, 27 | ) 28 | trust_remote_code: Optional[bool] = field( 29 | default=False, 30 | metadata={ 31 | 'help': 32 | 'Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained.' 33 | }) 34 | use_auth_token: Optional[bool] = field( 35 | default=False, 36 | metadata={ 37 | 'help': 38 | 'Enables using Huggingface auth token from Git Credentials.' 39 | }) 40 | -------------------------------------------------------------------------------- /source/model/llama2/data/alpaca_zh.yaml: -------------------------------------------------------------------------------- 1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments. 
2 | coig: 3 | hf_hub_url: BAAI/COIG 4 | local_path: /home/robin/prompt_data/COIG/train_alpaca.json 5 | dataset_format: alpaca 6 | multi_turn: False 7 | 8 | cvalues_comparison_train: 9 | hf_hub_url: '' 10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json 11 | dataset_format: alpaca 12 | multi_turn: False 13 | 14 | cvalues_comparison_test: 15 | hf_hub_url: '' 16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json 17 | dataset_format: alpaca 18 | multi_turn: False 19 | 20 | olcc: 21 | hf_hub_url: '' 22 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json 23 | dataset_format: alpaca 24 | multi_turn: False 25 | 26 | 100PoisonMpts: 27 | hf_hub_url: 'damo/100PoisonMpts' 28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json 29 | dataset_format: alpaca 30 | multi_turn: False 31 | 32 | safety_prompt_part1: 33 | hf_hub_url: '' 34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json 35 | dataset_format: alpaca 36 | multi_turn: False 37 | 38 | safety_prompt_part2: 39 | hf_hub_url: '' 40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json 41 | dataset_format: alpaca 42 | multi_turn: False 43 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/ds_config/default_offload_opt_param.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "cpu", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "sub_group_size": 1e9, 36 | "reduce_bucket_size": "auto", 37 | "stage3_prefetch_bucket_size": "auto", 38 | "stage3_param_persistence_threshold": "auto", 39 | "stage3_max_live_parameters": 1e9, 40 | "stage3_max_reuse_distance": 1e9, 41 | "stage3_gather_16bit_weights_on_model_save": false 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "steps_per_print": 5, 46 | "train_batch_size": "auto", 47 | "train_micro_batch_size_per_gpu": "auto", 48 | "wall_clock_breakdown": false 49 | } 50 | -------------------------------------------------------------------------------- /source/model/llama2/data/alpaca_zh_pcyn.yaml: -------------------------------------------------------------------------------- 1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments. 
2 | coig: 3 | hf_hub_url: BAAI/COIG 4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_alpaca.json 5 | dataset_format: alpaca 6 | multi_turn: False 7 | 8 | cvalues_comparison_train: 9 | hf_hub_url: '' 10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_alpaca.json 11 | dataset_format: alpaca 12 | multi_turn: False 13 | 14 | cvalues_comparison_test: 15 | hf_hub_url: '' 16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_alpaca.json 17 | dataset_format: alpaca 18 | multi_turn: False 19 | 20 | olcc: 21 | hf_hub_url: '' 22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_alpaca.json 23 | dataset_format: alpaca 24 | multi_turn: False 25 | 26 | 100PoisonMpts: 27 | hf_hub_url: '' 28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_alpaca.json 29 | dataset_format: alpaca 30 | multi_turn: False 31 | 32 | safety_prompt_part1: 33 | hf_hub_url: '' 34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json 35 | dataset_format: alpaca 36 | multi_turn: False 37 | 38 | safety_prompt_part2: 39 | hf_hub_url: '' 40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json 41 | dataset_format: alpaca 42 | multi_turn: False 43 | -------------------------------------------------------------------------------- /source/model/llama2/data/vicuna_zh_pcyn.yaml: -------------------------------------------------------------------------------- 1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments. 2 | coig: 3 | hf_hub_url: BAAI/COIG 4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_vicuna.json 5 | dataset_format: sharegpt 6 | multi_turn: True 7 | 8 | cvalues_comparison_train: 9 | hf_hub_url: '' 10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_vicuna.json 11 | dataset_format: sharegpt 12 | multi_turn: True 13 | 14 | cvalues_comparison_test: 15 | hf_hub_url: '' 16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_vicuna.json 17 | dataset_format: sharegpt 18 | multi_turn: True 19 | 20 | olcc: 21 | hf_hub_url: '' 22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_vicuna.json 23 | dataset_format: sharegpt 24 | multi_turn: True 25 | 26 | 100PoisonMpts: 27 | hf_hub_url: '' 28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_vicuna.json 29 | dataset_format: sharegpt 30 | multi_turn: True 31 | 32 | safety_prompt_part1: 33 | hf_hub_url: '' 34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json 35 | dataset_format: sharegpt 36 | multi_turn: True 37 | 38 | safety_prompt_part2: 39 | hf_hub_url: '' 40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json 41 | dataset_format: sharegpt 42 | multi_turn: True 43 | -------------------------------------------------------------------------------- /retrieval_contriever/example_scripts/contriever.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --cpus-per-task=5 3 | #SBATCH --nodes=4 4 | #SBATCH --ntasks-per-node=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --time=72:00:00 7 | #SBATCH --job-name=contriever 8 | #SBATCH --output=/private/home/gizacard/contriever/logtrain/%A 9 | #SBATCH --partition=learnlab 10 | #SBATCH --mem=450GB 11 | #SBATCH --signal=USR1@140 12 | #SBATCH --open-mode=append 13 | 14 | 15 | port=$(shuf -i 15000-16000 -n 1) 16 | TDIR="/private/home/gizacard/contriever/encoded-data" 17 | 
TRAINDATASETS="${TDIR}/wikisub/ ${TDIR}/cc-netsub/" 18 | 19 | rmin=0.05 20 | rmax=0.5 21 | T=0.05 22 | QSIZE=131072 23 | MOM=0.9995 24 | POOL=average 25 | AUG=delete 26 | PAUG=0.1 27 | LC=0. 28 | mo=bert-base-uncased 29 | mp=none 30 | 31 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG 32 | 33 | srun ~oceanntwt/anaconda3/envs/contriever/bin/python3 train.py \ 34 | --model_path $mp \ 35 | --sampling_coefficient $LC \ 36 | --retriever_model_id $mo --pooling $POOL \ 37 | --augmentation $AUG --prob_augmentation $PAUG \ 38 | --train_data $TRAINDATASETS --loading_mode split \ 39 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \ 40 | --momentum $MOM --queue_size $QSIZE --temperature $T \ 41 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \ 42 | --name $name \ 43 | --scheduler linear \ 44 | --optim adamw \ 45 | --per_gpu_batch_size 64 \ 46 | --output_dir /checkpoint/oceanntwt/contriever/$name \ 47 | --main_port $port \ 48 | 49 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/ds_config/ds_config_zero3_auto.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupDecayLR", 24 | "params": { 25 | "total_num_steps": "auto", 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 3, 33 | "offload_optimizer": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "offload_param": { 38 | "device": "cpu", 39 | "pin_memory": true 40 | }, 41 | "overlap_comm": true, 42 | "contiguous_gradients": true, 43 | "allgather_partitions": true, 44 | "allgather_bucket_size": 5e8, 45 | "sub_group_size": 1e9, 46 | "reduce_bucket_size": "auto", 47 | "stage3_prefetch_bucket_size": "auto", 48 | "stage3_param_persistence_threshold": "auto", 49 | "stage3_max_live_parameters": 1e9, 50 | "stage3_max_reuse_distance": 1e9, 51 | "stage3_gather_16bit_weights_on_model_save": true 52 | }, 53 | "train_batch_size": "auto", 54 | "train_micro_batch_size_per_gpu": "auto", 55 | "gradient_accumulation_steps": "auto", 56 | "gradient_clipping": "auto", 57 | "steps_per_print": 5, 58 | "wall_clock_breakdown": false 59 | } 60 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/infer_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class ModelInferenceArguments: 7 | cache_dir: Optional[str] = field(default=None) 8 | model_name_or_path: Optional[str] = field( 9 | default='facebook/opt-125m', 10 | metadata={'help': 'Path to pre-trained model'}) 11 | model_revision: str = field( 12 | default='main', 13 | metadata={ 14 | 'help': 15 | 'The specific model version to use (can be a branch name, tag name or commit id).' 
16 | }, 17 | ) 18 | trust_remote_code: Optional[bool] = field( 19 | default=False, 20 | metadata={ 21 | 'help': 22 | 'Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained.' 23 | }) 24 | use_auth_token: Optional[bool] = field( 25 | default=False, 26 | metadata={ 27 | 'help': 28 | 'Enables using Huggingface auth token from Git Credentials.' 29 | }) 30 | model_max_length: int = field( 31 | default=2048, 32 | metadata={ 33 | 'help': 34 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).' 35 | }, 36 | ) 37 | low_cpu_mem_usage: bool = field( 38 | default=True, 39 | metadata={'help': 'Whether to use low cpu memory usage mode.'}) 40 | fp16: bool = field(default=False, 41 | metadata={'help': 'Whether to use fp16.'}) 42 | prompt_template: str = field( 43 | default='default', 44 | metadata={ 45 | 'help': 46 | 'Prompt template name. Such as vanilla, alpaca, llama2, vicuna..., etc.' 47 | }) 48 | source_prefix: Optional[str] = field( 49 | default=None, 50 | metadata={'help': 'Prefix to prepend to every source text.'}) 51 | -------------------------------------------------------------------------------- /retrieval_contriever/example_scripts/mcontriever.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --cpus-per-task=5 3 | #SBATCH --nodes=8 4 | #SBATCH --ntasks-per-node=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --time=72:00:00 7 | #SBATCH --job-name=mcontriever 8 | #SBATCH --output=/private/home/oceanntwt/contriever/logtrain/%A 9 | #SBATCH --partition=learnlab 10 | #SBATCH --mem=450GB 11 | #SBATCH --signal=USR1@140 12 | #SBATCH --open-mode=append 13 | 14 | 15 | port=$(shuf -i 15000-16000 -n 1) 16 | 17 | TDIR=/private/home/oceanntwt/contriever/encoded-data/bert-base-multilingual-cased/ 18 | TRAINDATASETS="${TDIR}fr_XX ${TDIR}en_XX ${TDIR}ar_AR ${TDIR}bn_IN ${TDIR}fi_FI ${TDIR}id_ID ${TDIR}ja_XX ${TDIR}ko_KR ${TDIR}ru_RU ${TDIR}sw_KE ${TDIR}hu_HU ${TDIR}he_IL ${TDIR}it_IT ${TDIR}km_KM ${TDIR}ms_MY ${TDIR}nl_XX ${TDIR}no_XX ${TDIR}pl_PL ${TDIR}pt_XX ${TDIR}sv_SE ${TDIR}te_IN ${TDIR}th_TH ${TDIR}tr_TR ${TDIR}vi_VN ${TDIR}zh_CN ${TDIR}zh_TW ${TDIR}es_XX ${TDIR}de_DE ${TDIR}da_DK" 19 | 20 | rmin=0.1 21 | rmax=0.5 22 | T=0.05 23 | QSIZE=32768 24 | MOM=0.999 25 | POOL=average 26 | AUG=none 27 | PAUG=0. 28 | LC=0. 29 | mo=bert-base-multilingual-cased 30 | mp=none 31 | 32 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG 33 | 34 | srun ~oceanntwt/anaconda3/envs/pytorch10/bin/python3 ~oceanntwt/contriever/train.py \ 35 | --model_path $mp \ 36 | --sampling_coefficient $LC \ 37 | --augmentation $AUG --prob_augmentation $PAUG \ 38 | --retriever_model_id $mo --pooling $POOL \ 39 | --train_data $TRAINDATASETS --loading_mode split \ 40 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \ 41 | --momentum $MOM --queue_size $QSIZE --temperature $T \ 42 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \ 43 | --name $name \ 44 | --scheduler linear \ 45 | --optim adamw \ 46 | --per_gpu_batch_size 64 \ 47 | --output_dir /checkpoint/oceanntwt/contriever/xling/$name \ 48 | --main_port $port \ 49 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/utils/stream_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers to support streaming generate output. 
3 | Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py 4 | """ 5 | import traceback 6 | from queue import Queue 7 | from threading import Thread 8 | 9 | import transformers 10 | 11 | 12 | class Stream(transformers.StoppingCriteria): 13 | def __init__(self, callback_func=None): 14 | self.callback_func = callback_func 15 | 16 | def __call__(self, input_ids, scores) -> bool: 17 | if self.callback_func is not None: 18 | self.callback_func(input_ids[0]) 19 | return False 20 | 21 | 22 | class Iteratorize: 23 | """ 24 | Transforms a function that takes a callback 25 | into a lazy iterator (generator). 26 | """ 27 | def __init__(self, func, kwargs={}, callback=None): 28 | self.mfunc = func 29 | self.c_callback = callback 30 | self.q = Queue() 31 | self.sentinel = object() 32 | self.kwargs = kwargs 33 | self.stop_now = False 34 | 35 | def _callback(val): 36 | if self.stop_now: 37 | raise ValueError 38 | self.q.put(val) 39 | 40 | def gentask(): 41 | try: 42 | ret = self.mfunc(callback=_callback, **self.kwargs) 43 | except ValueError: 44 | pass 45 | except: 46 | traceback.print_exc() 47 | pass 48 | 49 | self.q.put(self.sentinel) 50 | if self.c_callback: 51 | self.c_callback(ret) 52 | 53 | self.thread = Thread(target=gentask) 54 | self.thread.start() 55 | 56 | def __iter__(self): 57 | return self 58 | 59 | def __next__(self): 60 | obj = self.q.get(True, None) 61 | if obj is self.sentinel: 62 | raise StopIteration 63 | else: 64 | return obj 65 | 66 | def __enter__(self): 67 | return self 68 | 69 | def __exit__(self, exc_type, exc_val, exc_tb): 70 | self.stop_now = True 71 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/data/data_loader.py: -------------------------------------------------------------------------------- 1 | from transformers.tokenization_utils import PreTrainedTokenizer 2 | 3 | from .conv_dataset import ConversationDataset, VicunaDataset 4 | from .data_utils import make_data_module 5 | from .sft_dataset import (DataCollatorForSupervisedDataset, 6 | SFTInstructionDataset) 7 | 8 | 9 | def make_supervised_data_module(tokenizer: PreTrainedTokenizer, args): 10 | train_dataset, eval_dataset, multi_turn = make_data_module(args) 11 | max_seq_length = tokenizer.model_max_length 12 | dataset_cls = (VicunaDataset if args.conversation_template == 'vicnua' else 13 | ConversationDataset) 14 | 15 | if not multi_turn: 16 | train_dataset = SFTInstructionDataset( 17 | train_dataset, 18 | tokenizer=tokenizer, 19 | max_seq_len=max_seq_length, 20 | ) if args.do_train else None 21 | 22 | eval_dataset = SFTInstructionDataset( 23 | eval_dataset, 24 | tokenizer=tokenizer, 25 | max_seq_len=max_seq_length, 26 | ) if args.do_eval else None 27 | 28 | else: 29 | train_dataset = dataset_cls( 30 | train_dataset, 31 | tokenizer=tokenizer, 32 | max_seq_length=max_seq_length, 33 | ) if args.do_train else None 34 | eval_dataset = dataset_cls( 35 | eval_dataset, 36 | tokenizer=tokenizer, 37 | max_seq_length=max_seq_length, 38 | ) if args.do_eval else None 39 | 40 | print( 41 | f'train_dataset: {type(train_dataset)}, mutlti-turn: {multi_turn}, #length: {len(train_dataset)}' 42 | ) if args.do_train else None 43 | print( 44 | f'eval_dataset: {type(eval_dataset)}, mutlti-turn: {multi_turn}, #length: {len(eval_dataset)}' 45 | ) if args.do_eval else None 46 | 47 | print('Adding data collator: ', DataCollatorForSupervisedDataset) 48 | data_collator = DataCollatorForSupervisedDataset( 49 
| tokenizer=tokenizer, predict_with_generate=args.predict_with_generate) 50 | 51 | return { 52 | 'train_dataset': train_dataset, 53 | 'eval_dataset': eval_dataset, 54 | 'data_collator': data_collator 55 | } 56 | -------------------------------------------------------------------------------- /source/model/flan-t5/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import datasets 3 | 4 | from datasets import load_dataset, concatenate_datasets, DatasetDict 5 | from transformers import AutoTokenizer 6 | from sklearn.model_selection import train_test_split 7 | from typing import List, Union 8 | 9 | 10 | def clean_text( 11 | texts: List[Union[str, None]], labels: List[Union[str, None]] 12 | ) -> pd.DataFrame: 13 | """ 14 | The News Group dataset needs to be preprocessed as it has a lot of 15 | entries with NULL text and/or NULL labels. 16 | In this function we simply filter out the NULL entries, and 17 | return a new dataframe with clean texts and labels. 18 | """ 19 | new_texts, new_labels = [], [] 20 | for text, label in zip(texts, labels): 21 | if isinstance(text, str) and isinstance(label, str): 22 | new_texts.append(text) 23 | new_labels.append(label) 24 | new_ids = [i for i in range(len(new_texts))] 25 | df = pd.DataFrame(data={"id": new_ids, "text": new_texts, "label": new_labels}) 26 | 27 | return df 28 | 29 | def get_data(tokenizer: AutoTokenizer) -> List[Union[DatasetDict, int, int]]: 30 | dataset_id = "nq_open" 31 | # Load dataset from the hub 32 | dataset = load_dataset(dataset_id) 33 | 34 | print(f"Train dataset size: {len(dataset['train'])}") 35 | print(f"Test dataset size: {len(dataset['validation'])}") # if validate 36 | 37 | tokenized_inputs = concatenate_datasets([dataset["train"]]).map( 38 | lambda x: tokenizer(x["question"], truncation=True), 39 | batched=True, 40 | remove_columns=["question", "answer"], 41 | ) 42 | 43 | max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]]) 44 | print(f"Max source length: {max_source_length}") 45 | 46 | tokenized_targets = concatenate_datasets([dataset["train"], dataset["validation"]]).map( 47 | lambda x: tokenizer(x["answer"], truncation=True), 48 | batched=True, 49 | remove_columns=["question", "answer"], 50 | ) 51 | 52 | max_target_length = max([len(x) for x in tokenized_targets["input_ids"]]) 53 | print(f"Max target length: {max_target_length}") 54 | 55 | return dataset, max_source_length, max_target_length 56 | -------------------------------------------------------------------------------- /retrieval_contriever/evaluate_retrieved_passages.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import argparse 8 | import json 9 | import logging 10 | import glob 11 | 12 | import numpy as np 13 | import torch 14 | 15 | import src.utils 16 | 17 | from src.evaluation import calculate_matches 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | def validate(data, workers_num): 22 | match_stats = calculate_matches(data, workers_num) 23 | top_k_hits = match_stats.top_k_hits 24 | 25 | #logger.info('Validation results: top k documents hits %s', top_k_hits) 26 | top_k_hits = [v / len(data) for v in top_k_hits] 27 | #logger.info('Validation results: top k documents hits accuracy %s', top_k_hits) 28 | return top_k_hits 29 | 30 | 31 | def main(opt): 32 | logger = src.utils.init_logger(opt, stdout_only=True) 33 | datapaths = glob.glob(args.data) 34 | r20, r100 = [], [] 35 | for path in datapaths: 36 | data = [] 37 | with open(path, 'r') as fin: 38 | for line in fin: 39 | data.append(json.loads(line)) 40 | #data = json.load(fin) 41 | answers = [ex['answers'] for ex in data] 42 | top_k_hits = validate(data, args.validation_workers) 43 | message = f"Evaluate results from {path}:" 44 | for k in [5, 10, 20, 100]: 45 | if k <= len(top_k_hits): 46 | recall = 100 * top_k_hits[k-1] 47 | if k == 20: 48 | r20.append(f"{recall:.1f}") 49 | if k == 100: 50 | r100.append(f"{recall:.1f}") 51 | message += f' R@{k}: {recall:.1f}' 52 | logger.info(message) 53 | print(datapaths) 54 | print('\t'.join(r20)) 55 | print('\t'.join(r100)) 56 | 57 | 58 | if __name__ == '__main__': 59 | parser = argparse.ArgumentParser() 60 | 61 | parser.add_argument('--data', required=True, type=str, default=None) 62 | parser.add_argument('--validation_workers', type=int, default=16, 63 | help="Number of parallel processes to validate results") 64 | 65 | args = parser.parse_args() 66 | main(args) 67 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_arguments(): 5 | parser = argparse.ArgumentParser() 6 | 7 | parser.add_argument( 8 | "--iteration_max_time", type=int, default=3, help="maximum number of iterations in RA-ISF."
9 | ) 10 | parser.add_argument( 11 | "--temperature", type=float, default=0, help="sampling temperature for generation" 12 | ) 13 | parser.add_argument( 14 | "--max_length", type=int, default=256, help="maximum number of tokens generated by the base model" 15 | ) 16 | parser.add_argument( 17 | "--type_list_file", default="./src/format/entity_type_list.txt", type=str, help='file path' 18 | ) 19 | parser.add_argument( 20 | "--prompt_id", default='324', help='string' 21 | ) 22 | parser.add_argument( 23 | "--infer_num", default='5', help='string' 24 | ) 25 | parser.add_argument( 26 | "--engine", default='llama2-13b', help="llama2-7b, llama2-13b, gpt-3.5", 27 | choices=["llama2-7b", "llama2-13b", "gpt-3.5"] 28 | ) 29 | parser.add_argument( 30 | "--api_key", default="", help="gpt3.5 api key" 31 | ) 32 | parser.add_argument( 33 | "--base_model_path", default='/root/autodl-tmp/llama-7b-hf', help="your local model path" 34 | ) 35 | parser.add_argument( 36 | "--self_knowledge_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel self-knowledge path" 37 | ) 38 | parser.add_argument( 39 | "--passage_relevance_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel passage_relevance path" 40 | ) 41 | parser.add_argument( 42 | "--task_decomposition_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel task_decomposition path" 43 | ) 44 | parser.add_argument( 45 | "--data_path", default='/root/workspace/ra-isf/dataset/natural_question/nq_open.json', help="your local data path" 46 | ) 47 | parser.add_argument( 48 | "--output_path", default='/root/workspace/ra-isf/output/output.json', help="your local output file data path" 49 | ) 50 | parser.add_argument( 51 | "--test_start", default='0', help='string, number' 52 | ) 53 | parser.add_argument( 54 | "--test_end", default='full', help='string, number' 55 | ) 56 | parsed_args = parser.parse_args() 57 | return parsed_args 58 | 59 | 60 | args = parse_arguments() 61 | -------------------------------------------------------------------------------- /contriever_config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_retriever_arguments(): 4 | parser = argparse.ArgumentParser() 5 | 6 | parser.add_argument( 7 | "--data", 8 | # required=True, 9 | type=str, 10 | default=None, 11 | help=".json file containing question and answers, similar format to reader data", 12 | ) 13 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)") 14 | parser.add_argument("--passages_embeddings", type=str, default=None, help="Glob path to encoded passages") 15 | parser.add_argument( 16 | "--output_dir", type=str, default=None, help="Results are written to outputdir with data suffix" 17 | ) 18 | parser.add_argument("--n_docs", type=int, default=100, help="Number of documents to retrieve per question") 19 | parser.add_argument( 20 | "--validation_workers", type=int, default=32, help="Number of parallel processes to validate results" 21 | ) 22 | parser.add_argument("--per_gpu_batch_size", type=int, default=64, help="Batch size for question encoding") 23 | parser.add_argument( 24 | "--save_or_load_index", action="store_true", help="If enabled, save index and load index if it exists" 25 | ) 26 | parser.add_argument( 27 | "--model_name_or_path", type=str, default="/root/autodl-tmp/contriever-msmarco", help="path to directory containing model weights and config file" 28 | ) 29 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32") 30 |
parser.add_argument("--question_maxlength", type=int, default=512, help="Maximum number of tokens in a question") 31 | parser.add_argument( 32 | "--indexing_batch_size", type=int, default=1000000, help="Batch size of the number of passages indexed" 33 | ) 34 | parser.add_argument("--projection_size", type=int, default=768) 35 | parser.add_argument( 36 | "--n_subquantizers", 37 | type=int, 38 | default=0, 39 | help="Number of subquantizer used for vector quantization, if 0 flat index is used", 40 | ) 41 | parser.add_argument("--n_bits", type=int, default=8, help="Number of bits per subquantizer") 42 | parser.add_argument("--lang", nargs="+") 43 | parser.add_argument("--dataset", type=str, default="none") 44 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding") 45 | parser.add_argument("--normalize_text", action="store_true", help="normalize text") 46 | parsed_args = parser.parse_args() 47 | return parsed_args 48 | 49 | c_args = parse_retriever_arguments() -------------------------------------------------------------------------------- /retrieval_contriever/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | import os 4 | import argparse 5 | import torch 6 | 7 | import transformers 8 | from src.normalize_text import normalize 9 | 10 | 11 | def save(tensor, split_path): 12 | if not os.path.exists(os.path.dirname(split_path)): 13 | os.makedirs(os.path.dirname(split_path)) 14 | with open(split_path, 'wb') as fout: 15 | torch.save(tensor, fout) 16 | 17 | def apply_tokenizer(path, tokenizer, normalize_text=False): 18 | alltokens = [] 19 | lines = [] 20 | with open(path, "r", encoding="utf-8") as fin: 21 | for k, line in enumerate(fin): 22 | if normalize_text: 23 | line = normalize(line) 24 | 25 | lines.append(line) 26 | if len(lines) > 1000000: 27 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids'] 28 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens] 29 | alltokens.extend(tokens) 30 | lines = [] 31 | 32 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids'] 33 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens] 34 | alltokens.extend(tokens) 35 | 36 | alltokens = torch.cat(alltokens) 37 | return alltokens 38 | 39 | def tokenize_file(args): 40 | filename = os.path.basename(args.datapath) 41 | savepath = os.path.join(args.outdir, f"{filename}.pkl") 42 | if os.path.exists(savepath): 43 | if args.overwrite: 44 | print(f"File {savepath} already exists, overwriting") 45 | else: 46 | print(f"File {savepath} already exists, exiting") 47 | return 48 | try: 49 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=True) 50 | except: 51 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=False) 52 | print(f"Encoding {args.datapath}...") 53 | tokens = apply_tokenizer(args.datapath, tokenizer, normalize_text=args.normalize_text) 54 | 55 | print(f"Saving at {savepath}...") 56 | save(tokens, savepath) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 61 | parser.add_argument("--datapath", type=str) 62 | parser.add_argument("--outdir", type=str) 63 | parser.add_argument("--tokenizer", type=str) 64 | parser.add_argument("--overwrite", action="store_true") 65 | parser.add_argument("--normalize_text", 
action="store_true") 66 | 67 | args, _ = parser.parse_known_args() 68 | tokenize_file(args) 69 | -------------------------------------------------------------------------------- /source/model/llama2/examples/vllm/apil_chient.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {'User-Agent': 'Test Client'} 22 | pload = { 23 | 'prompt': prompt, 24 | 'n': n, 25 | 'use_beam_search': True, 26 | 'temperature': 0.0, 27 | 'max_tokens': 16, 28 | 'stream': stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b'\0'): 38 | if chunk: 39 | data = json.loads(chunk.decode('utf-8')) 40 | output = data['text'] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data['text'] 47 | return output 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--host', type=str, default='localhost') 53 | parser.add_argument('--port', type=int, default=8000) 54 | parser.add_argument('--n', type=int, default=4) 55 | parser.add_argument('--prompt', type=str, default='San Francisco is a') 56 | parser.add_argument('--stream', action='store_true') 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f'http://{args.host}:{args.port}/generate' 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f'Prompt: {prompt!r}\n', flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f'Beam candidate {i}: {line!r}', flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f'Beam candidate {i}: {line!r}', flush=True) 78 | -------------------------------------------------------------------------------- /source/model/llama2/scripts/clean_data.sh: -------------------------------------------------------------------------------- 1 | # sharegpt 2 | python clean_sharegpt.py \ 3 | --in-file /userhome/jianzhnie/prompt_data/anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json \ 4 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json 5 | 6 | python split_long_conversation.py \ 7 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json \ 8 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json \ 9 | --model-name-or-path /userhome/jianzhnie/checkpoints/llama7b 10 | 11 | python clean_evol_instruct.py \ 12 | --in-file /userhome/jianzhnie/prompt_data/WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json \ 13 | 
--out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json 14 | 15 | python merge.py \ 16 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json \ 17 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_sharegpt_merge.json 18 | 19 | # chinese data 20 | python chatllms/data/utils/convert_alpaca.py \ 21 | --in-file ./prompt_data/chinese_data/alpaca_data_zh_51k.json \ 22 | --out-file ./prompt_data/chinese_data/alpaca_vicuna.json 23 | 24 | python chatllms/data/utils/convert_alpaca.py \ 25 | --in-file ./prompt_data/InstructionWild/instinwild_ch.json \ 26 | --out-file ./prompt_data/chinese_data/instinwild_ch_vicuna.json 27 | 28 | python chatllms/data/utils/convert_alpaca.py \ 29 | --in-file ./prompt_data/InstructionWild/instinwild_en.json \ 30 | --out-file ./prompt_data/chinese_data/instinwild_en_vicuna.json 31 | 32 | python chatllms/data/utils/convert_alpaca.py \ 33 | --in-file ./prompt_data/databricks-dolly-15k/databricks-dolly-15k.jsonl \ 34 | --out-file ./prompt_data/chinese_data/dolly-15k_vicuna.json 35 | 36 | python merge.py \ 37 | --in-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/alpaca_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/dolly-15k_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_ch_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_en_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/olcc.json\ 38 | --out-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/vicuna_merge.json 39 | 40 | 41 | # belle-group 42 | python chatllms/data/utils/convert_alpaca.py \ 43 | --in-file ./prompt_data/belle_group/generated_chat_0.4M/generated_chat_0.4M.json \ 44 | --out-file ./prompt_data/belle_group/generated_chat_vicuna.json 45 | 46 | 47 | python chatllms/data/utils/convert_alpaca.py \ 48 | --in-file ./prompt_data/belle_group/school_math_0.25M/school_math_0.25M.json \ 49 | --out-file ./prompt_data/belle_group/school_math_vicuna.json 50 | -------------------------------------------------------------------------------- /source/model/llama2/examples/test_convdataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | import numpy as np 5 | 6 | sys.path.append('../') 7 | from typing import Any, Dict 8 | 9 | import transformers 10 | 11 | from chatllms.data.conv_dataset import ConversationDataset, UltraChatDataset 12 | from chatllms.data.data_utils import (DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, 13 | DEFAULT_PAD_TOKEN, DEFAULT_UNK_TOKEN) 14 | 15 | if __name__ == '__main__': 16 | # Load the raw data from the specified data_path 17 | data_path = '/home/robin/work_dir/llm/FastChat/data/dummy_conversation.json' 18 | with open(data_path, 'r') as file: 19 | raw_data = json.load(file) 20 | 21 | model_name_or_path = '/home/robin/checkpoints/baichuan7b' 22 | model_name_or_path = 'facebook/opt-125m' 23 | sources = [example['conversations'] for example in raw_data] 24 | tokenizer = transformers.AutoTokenizer.from_pretrained( 25 | model_name_or_path, 26 | model_max_length=64, 27 | padding_side='right', 28 | use_fast=False, 29 | add_special_tokens=False, 30 | tokenizer_type='llama' if 'llama' in model_name_or_path else 'gpt2', 31 | ) 32 | # Define a dictionary to store any missing special tokens along with their default values 33 | special_tokens_dict: Dict[str, 
Any] = {} 34 | 35 | # Check if each special token is present. If not, add it to the special_tokens_dict with its default value. 36 | if tokenizer.pad_token is None: 37 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN 38 | if tokenizer.eos_token is None: 39 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN 40 | if tokenizer.bos_token is None: 41 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN 42 | if tokenizer.unk_token is None: 43 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN 44 | 45 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) 46 | 47 | print(tokenizer.bos_token) 48 | # # Apply the conversation function to the raw data 49 | dataset = ConversationDataset(sources, tokenizer, 64) 50 | 51 | for idx, data in enumerate(dataset): 52 | print('==' * 10) 53 | input_ids = data['input_ids'] 54 | input_txt = tokenizer.decode(input_ids) 55 | print(input_txt) 56 | targets = data['labels'] 57 | input_ids = np.array(input_ids) 58 | target_text = tokenizer.decode(targets) 59 | print(target_text) 60 | if idx > 10: 61 | break 62 | 63 | dataset = UltraChatDataset(sources, tokenizer, 128) 64 | for idx, data in enumerate(dataset): 65 | input_ids = data['input_ids'] 66 | labels = data['labels'] 67 | input_txt = tokenizer.decode(input_ids) 68 | target_text = tokenizer.decode(labels) 69 | print(input_txt) 70 | print(target_text) 71 | if idx > 10: 72 | break 73 | -------------------------------------------------------------------------------- /source/model/llama2/server/multi_chat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from threading import Thread 3 | 4 | import torch 5 | import transformers 6 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 7 | TextIteratorStreamer) 8 | 9 | sys.path.append('../') 10 | from chatllms.configs import GenerationArguments, ModelInferenceArguments 11 | from chatllms.utils.model_utils import get_logits_processor 12 | 13 | 14 | def main(model_server_args, generation_args): 15 | """ 16 | Multi-turn chat, without memory of the conversation history. 17 | """ 18 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_server_args.model_name_or_path, 21 | cache_dir=model_server_args.cache_dir, 22 | trust_remote_code=True, 23 | low_cpu_mem_usage=True, 24 | torch_dtype=torch.float16, 25 | device_map='auto').to(device).eval() 26 | tokenizer = AutoTokenizer.from_pretrained( 27 | model_server_args.model_name_or_path, 28 | trust_remote_code=True, 29 | use_fast=False, 30 | ) 31 | # Record the entire conversation history 32 | historys = tokenizer.bos_token 33 | print('User: ', end='', flush=True) 34 | user_input = input('') 35 | while True: 36 | user_input = '{}'.format(user_input).strip() 37 | historys = historys + user_input 38 | inputs = tokenizer(historys, 39 | return_tensors='pt', 40 | add_special_tokens=False) 41 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 42 | 43 | # Create a TextIteratorStreamer object to stream the response from the model 44 | streamer = TextIteratorStreamer(tokenizer, 45 | timeout=60.0, 46 | skip_prompt=True, 47 | skip_special_tokens=True) 48 | 49 | # Set the arguments for the model's generate() method 50 | gen_kwargs = dict( 51 | inputs, 52 | streamer=streamer, 53 | logits_processor=get_logits_processor(), 54 | **generation_args.to_dict(), 55 | ) 56 | 57 | # Start a separate thread to generate the response asynchronously 58 | thread = Thread(target=model.generate, kwargs=gen_kwargs) 59 | thread.start() 60 | 61 | # Print the model name
and the response as it is generated 62 | print('Assistant: ', end='', flush=True) 63 | response = '' 64 | for new_text in streamer: 65 | print(new_text, end='', flush=True) 66 | response += new_text 67 | 68 | historys = historys + response 69 | print('\n') 70 | print('User: ', end='', flush=True) 71 | user_input = input('') 72 | 73 | 74 | if __name__ == '__main__': 75 | parser = transformers.HfArgumentParser( 76 | (ModelInferenceArguments, GenerationArguments)) 77 | model_server_args, generation_args = parser.parse_args_into_dataclasses() 78 | main(model_server_args, generation_args) 79 | -------------------------------------------------------------------------------- /source/model/llama2/examples/format_data/convert_oasst1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import random 5 | 6 | 7 | def json_dump(obj, path): 8 | with open(path, 'w', encoding='utf-8') as f: 9 | json.dump(obj, f, indent=2, ensure_ascii=False) 10 | 11 | 12 | def json_load(in_file): 13 | with open(in_file, 'r') as f: 14 | json_data = json.load(f) 15 | return json_data 16 | 17 | 18 | def convert_oasst1_data(data_dir, output_dir): 19 | ''' 20 | For OASST1, because it's in a tree structure, where every user input might get multiple replies, 21 | we have to save every path from the root node to the assistant reply (including both leaf node and intemediate node). 22 | This results in some of the messages being duplicated among different paths (instances). 23 | Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path. 24 | ''' 25 | conversations = [] 26 | with open(os.path.join(data_dir, '2023-04-12_oasst_ready.trees.jsonl'), 27 | 'r') as fin: 28 | for line in fin: 29 | conversations.append(json.loads(line)) 30 | 31 | output_path = os.path.join(output_dir, 'oasst1_data.jsonl') 32 | 33 | # tranvers the conversation tree, and collect all valid sequences 34 | def dfs(reply, messages, valid_sequences): 35 | if reply['role'] == 'assistant': 36 | messages.append({'role': 'assistant', 'content': reply['text']}) 37 | valid_sequences.append(messages[:]) 38 | for child in reply['replies']: 39 | dfs(child, messages, valid_sequences) 40 | messages.pop() 41 | elif reply['role'] == 'prompter': 42 | messages.append({'role': 'user', 'content': reply['text']}) 43 | for child in reply['replies']: 44 | dfs(child, messages, valid_sequences) 45 | messages.pop() 46 | else: 47 | raise ValueError(f"Unknown role: {reply['role']}") 48 | 49 | with open(output_path, 'w') as fout: 50 | example_cnt = 0 51 | for _, conversation in enumerate(conversations): 52 | valid_sequences = [] 53 | dfs(conversation['prompt'], [], valid_sequences) 54 | for sequence in valid_sequences: 55 | fout.write( 56 | json.dumps({ 57 | 'dataset': 'oasst1', 58 | 'id': f'oasst1_{example_cnt}', 59 | 'messages': sequence 60 | }) + '\n') 61 | example_cnt += 1 62 | 63 | 64 | if __name__ == '__main__': 65 | arg_parser = argparse.ArgumentParser() 66 | arg_parser.add_argument('--raw_data_dir', 67 | type=str, 68 | default='data/downloads') 69 | arg_parser.add_argument('--output_dir', type=str, default='data/processed') 70 | arg_parser.add_argument('--seed', type=int, default=42) 71 | args = arg_parser.parse_args() 72 | random.seed(args.seed) 73 | 74 | convert_oasst1_data(data_dir=args.raw_data_dir, output_dir=args.output_dir) 75 | -------------------------------------------------------------------------------- 
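A minimal usage sketch for the convert_oasst1.py script above, assuming the OASST1 trees file has already been downloaded; the directory values simply restate the script's argparse defaults, and the input/output filenames are the ones hard-coded in the script:
python convert_oasst1.py \
    --raw_data_dir data/downloads \
    --output_dir data/processed \
    --seed 42
# reads data/downloads/2023-04-12_oasst_ready.trees.jsonl and writes data/processed/oasst1_data.jsonl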
/source/model/llama2/chatllms/configs/train_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from transformers import TrainingArguments 5 | 6 | 7 | @dataclass 8 | class TrainingArguments(TrainingArguments): 9 | # Cache directory 10 | cache_dir: Optional[str] = field(default=None) 11 | # Full fine-tuning without adapters (i.e. no LoRA or QLoRA) 12 | full_finetune: bool = field( 13 | default=False, 14 | metadata={'help': 'Finetune the entire model without adapters.'}) 15 | # Whether to run training (of course we do) 16 | do_train: bool = field( 17 | default=True, 18 | metadata={'help': 'To train or not to train, that is the question?'}) 19 | # Whether to run evaluation 20 | do_eval: bool = field( 21 | default=False, 22 | metadata={'help': 'Whether to run evaluation on the eval dataset.'}) 23 | # Whether to run the MMLU evaluation 24 | do_mmlu_eval: Optional[bool] = field( 25 | default=False, 26 | metadata={'help': 'Whether to run the MMLU evaluation.'}) 27 | # Default MMLU dataset name: `mmlu-zs` for zero-shot or `mmlu-fs` for few-shot. 28 | mmlu_dataset: Optional[str] = field( 29 | default='mmlu-fs', 30 | metadata={ 31 | 'help': 32 | 'MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot.' 33 | }) 34 | # Default MMLU split: `eval` for evaluation or `test` for testing. 35 | mmlu_split: Optional[str] = field( 36 | default='eval', metadata={'help': 'The MMLU split to run on'}) 37 | # Default maximum number of MMLU samples to evaluate on 38 | max_mmlu_samples: Optional[int] = field( 39 | default=None, 40 | metadata={ 41 | 'help': 42 | 'If set, only evaluates on `max_mmlu_samples` of the MMLU dataset.' 43 | }) 44 | # Maximum length of the MMLU source text (whether this is character or token length is determined in the code) 45 | mmlu_source_max_len: int = field( 46 | default=2048, 47 | metadata={'help': 'Maximum source sequence length for mmlu.'}) 48 | # Whether to run sample generation during evaluation 49 | sample_generate: bool = field( 50 | default=False, 51 | metadata={'help': 'Whether to do sample generation during evaluation.'}) 52 | # Paged optimizer (NVIDIA paging mechanism), which lets training survive occasional OOM spikes and continue. 53 | optim: str = field(default='paged_adamw_32bit', 54 | metadata={'help': 'The optimizer to be used'}) 55 | # Gradient clipping factor 56 | max_grad_norm: float = field( 57 | default=0.3, 58 | metadata={ 59 | 'help': 60 | 'Gradient clipping max norm. This is tuned and works well for all models tested.' 61 | }) 62 | # Gradient checkpointing: set to True to reduce GPU memory usage. 63 | # With GPU memory this tight it should be True, although it increases run time. 64 | gradient_checkpointing: bool = field( 65 | default=True, 66 | metadata={'help': 'Use gradient checkpointing. You want to use this.'}) 67 | predict_with_generate: bool = field( 68 | default=False, 69 | metadata={ 70 | 'help': 71 | 'Whether to use generate() to produce predictions during evaluation, so generative metrics can be computed.' 72 | }) 73 | model_max_length: int = field( 74 | default=1024, 75 | metadata={ 76 | 'help': 77 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).' 78 | }, 79 | ) 80 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/data/utils/convert_alpaca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert alpaca dataset into sharegpt format.
3 | 4 | Usage: python3 -m chatllms.data.convert_alpaca --in alpaca_data.json 5 | """ 6 | 7 | import argparse 8 | import json 9 | from typing import Any, Dict, List 10 | 11 | from datasets import load_dataset 12 | 13 | 14 | def json_dump(obj, path): 15 | with open(path, 'w', encoding='utf-8') as f: 16 | json.dump(obj, f, indent=2, ensure_ascii=False) 17 | 18 | 19 | def json_load(in_file): 20 | with open(in_file, 'r') as f: 21 | json_data = json.load(f) 22 | return json_data 23 | 24 | 25 | def valid_keys(keys): 26 | for k in ['instruction', 'input', 'output']: 27 | if k not in keys: 28 | return False 29 | return True 30 | 31 | 32 | def convert_alpaca_vicuna(raw_data: List[Dict[str, Any]]): 33 | collect_data = [] 34 | for i, content in enumerate(raw_data): 35 | if not valid_keys(content.keys()): 36 | continue 37 | 38 | if len(content['input'].strip()) > 1: 39 | q, a = content['instruction'] + '\nInput:\n' + content[ 40 | 'input'], content['output'] 41 | else: 42 | q, a = content['instruction'], content['output'] 43 | 44 | collect_data.append({ 45 | 'id': 46 | f'alpaca_{i}', 47 | 'conversations': [ 48 | { 49 | 'from': 'human', 50 | 'value': q 51 | }, 52 | { 53 | 'from': 'gpt', 54 | 'value': a 55 | }, 56 | ], 57 | }) 58 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}') 59 | return collect_data 60 | 61 | 62 | def convert_dolly_vicuna(raw_data: List[Dict[str, Any]]): 63 | collect_data = [] 64 | for i, content in enumerate(raw_data): 65 | if len(content['context'].strip()) > 1: 66 | q, a = content['instruction'] + '\nInput:\n' + content[ 67 | 'context'], content['response'] 68 | else: 69 | q, a = content['instruction'], content['response'] 70 | 71 | collect_data.append({ 72 | 'id': 73 | f'alpaca_{i}', 74 | 'conversations': [ 75 | { 76 | 'from': 'human', 77 | 'value': q 78 | }, 79 | { 80 | 'from': 'gpt', 81 | 'value': a 82 | }, 83 | ], 84 | }) 85 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}') 86 | return collect_data 87 | 88 | 89 | def main(): 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--in-file', type=str) 92 | parser.add_argument('--out-file', type=str) 93 | args = parser.parse_args() 94 | 95 | raw_data = load_dataset('json', data_files=args.in_file)['train'] 96 | new_data = convert_alpaca_vicuna(raw_data) 97 | 98 | # new_data = convert_dolly_vicuna(raw_data) 99 | # new_data = convert_alpaca_vicuna(raw_data) 100 | json_dump(new_data, args.out_file) 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /retrieval_contriever/src/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import os 8 | import pickle 9 | from typing import List, Tuple 10 | 11 | import faiss 12 | import numpy as np 13 | from tqdm import tqdm 14 | 15 | class Indexer(object): 16 | 17 | def __init__(self, vector_sz, n_subquantizers=0, n_bits=8): 18 | if n_subquantizers > 0: 19 | self.index = faiss.IndexPQ(vector_sz, n_subquantizers, n_bits, faiss.METRIC_INNER_PRODUCT) 20 | else: 21 | self.index = faiss.IndexFlatIP(vector_sz) 22 | #self.index_id_to_db_id = np.empty((0), dtype=np.int64) 23 | self.index_id_to_db_id = [] 24 | 25 | def index_data(self, ids, embeddings): 26 | self._update_id_mapping(ids) 27 | embeddings = embeddings.astype('float32') 28 | if not self.index.is_trained: 29 | self.index.train(embeddings) 30 | self.index.add(embeddings) 31 | 32 | print(f'Total data indexed {len(self.index_id_to_db_id)}') 33 | 34 | def search_knn(self, query_vectors: np.array, top_docs: int, index_batch_size: int = 2048) -> List[Tuple[List[object], List[float]]]: 35 | query_vectors = query_vectors.astype('float32') 36 | result = [] 37 | nbatch = (len(query_vectors)-1) // index_batch_size + 1 38 | for k in tqdm(range(nbatch)): 39 | start_idx = k*index_batch_size 40 | end_idx = min((k+1)*index_batch_size, len(query_vectors)) 41 | q = query_vectors[start_idx: end_idx] 42 | scores, indexes = self.index.search(q, top_docs) 43 | # convert to external ids 44 | db_ids = [[str(self.index_id_to_db_id[i]) for i in query_top_idxs] for query_top_idxs in indexes] 45 | result.extend([(db_ids[i], scores[i]) for i in range(len(db_ids))]) 46 | return result 47 | 48 | def serialize(self, dir_path): 49 | index_file = os.path.join(dir_path, 'index.faiss') 50 | meta_file = os.path.join(dir_path, 'index_meta.faiss') 51 | print(f'Serializing index to {index_file}, meta data to {meta_file}') 52 | 53 | faiss.write_index(self.index, index_file) 54 | with open(meta_file, mode='wb') as f: 55 | pickle.dump(self.index_id_to_db_id, f) 56 | 57 | def deserialize_from(self, dir_path): 58 | index_file = os.path.join(dir_path, 'index.faiss') 59 | meta_file = os.path.join(dir_path, 'index_meta.faiss') 60 | print(f'Loading index from {index_file}, meta data from {meta_file}') 61 | 62 | self.index = faiss.read_index(index_file) 63 | print('Loaded index of type %s and size %d', type(self.index), self.index.ntotal) 64 | 65 | with open(meta_file, "rb") as reader: 66 | self.index_id_to_db_id = pickle.load(reader) 67 | assert len( 68 | self.index_id_to_db_id) == self.index.ntotal, 'Deserialized index_id_to_db_id should match faiss index size' 69 | 70 | def _update_id_mapping(self, db_ids: List): 71 | #new_ids = np.array(db_ids, dtype=np.int64) 72 | #self.index_id_to_db_id = np.concatenate((self.index_id_to_db_id, new_ids), axis=0) 73 | self.index_id_to_db_id.extend(db_ids) -------------------------------------------------------------------------------- /retrieval_contriever/src/inbatch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | import math 7 | import random 8 | import transformers 9 | import logging 10 | import torch.distributed as dist 11 | 12 | from retrieval_contriever.src import contriever, dist_utils, utils 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class InBatch(nn.Module): 18 | def __init__(self, opt, retriever=None, tokenizer=None): 19 | super(InBatch, self).__init__() 20 | 21 | self.opt = opt 22 | self.norm_doc = opt.norm_doc 23 | self.norm_query = opt.norm_query 24 | self.label_smoothing = opt.label_smoothing 25 | if retriever is None or tokenizer is None: 26 | retriever, tokenizer = self._load_retriever( 27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init 28 | ) 29 | self.tokenizer = tokenizer 30 | self.encoder = retriever 31 | 32 | def _load_retriever(self, model_id, pooling, random_init): 33 | cfg = utils.load_hf(transformers.AutoConfig, model_id) 34 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id) 35 | 36 | if "xlm" in model_id: 37 | model_class = contriever.XLMRetriever 38 | else: 39 | model_class = contriever.Contriever 40 | 41 | if random_init: 42 | retriever = model_class(cfg) 43 | else: 44 | retriever = utils.load_hf(model_class, model_id) 45 | 46 | if "bert-" in model_id: 47 | if tokenizer.bos_token_id is None: 48 | tokenizer.bos_token = "[CLS]" 49 | if tokenizer.eos_token_id is None: 50 | tokenizer.eos_token = "[SEP]" 51 | 52 | retriever.config.pooling = pooling 53 | 54 | return retriever, tokenizer 55 | 56 | def get_encoder(self): 57 | return self.encoder 58 | 59 | def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats={}, **kwargs): 60 | 61 | bsz = len(q_tokens) 62 | labels = torch.arange(0, bsz, dtype=torch.long, device=q_tokens.device) 63 | 64 | qemb = self.encoder(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query) 65 | kemb = self.encoder(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc) 66 | 67 | gather_fn = dist_utils.gather 68 | 69 | gather_kemb = gather_fn(kemb) 70 | 71 | labels = labels + dist_utils.get_rank() * len(kemb) 72 | 73 | scores = torch.einsum("id, jd->ij", qemb / self.opt.temperature, gather_kemb) 74 | 75 | loss = torch.nn.functional.cross_entropy(scores, labels, label_smoothing=self.label_smoothing) 76 | 77 | # log stats 78 | if len(stats_prefix) > 0: 79 | stats_prefix = stats_prefix + "/" 80 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz) 81 | 82 | predicted_idx = torch.argmax(scores, dim=-1) 83 | accuracy = 100 * (predicted_idx == labels).float().mean() 84 | stdq = torch.std(qemb, dim=0).mean().item() 85 | stdk = torch.std(kemb, dim=0).mean().item() 86 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz) 87 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz) 88 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz) 89 | 90 | return loss, iter_stats 91 | -------------------------------------------------------------------------------- /source/model/llama2/examples/format_data/convert_vicuna.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from datasets import load_dataset 5 | 6 | sys.path.append('../../') 7 | 8 | from chatllms.data.data_utils import extract_default_prompt_dataset 9 | 10 | 11 | def json_dump(obj, path): 12 | with open(path, 'w', encoding='utf-8') as f: 13 | json.dump(obj, f, indent=2, ensure_ascii=False) 14 | 15 | 16 | def json_load(in_file): 17 | with 
open(in_file, 'r') as f: 18 | json_data = json.load(f) 19 | return json_data 20 | 21 | 22 | def valid_keys(keys): 23 | for k in ['input', 'output']: 24 | if k not in keys: 25 | return False 26 | return True 27 | 28 | 29 | def remove_unused_columns(dataset): 30 | """Remove columns not named 'input' or 'output'.""" 31 | dataset = dataset.remove_columns([ 32 | col for col in dataset.column_names if col not in ['input', 'output'] 33 | ]) 34 | return dataset 35 | 36 | 37 | def convert_alpaca_vicuna(in_file: str, out_file: str = None): 38 | raw_dataset = load_dataset('json', data_files=in_file)['train'] 39 | raw_dataset = raw_dataset.map(extract_default_prompt_dataset) 40 | 41 | collect_data = [] 42 | for i, content in enumerate(raw_dataset): 43 | prompt = content['input'] 44 | response = content['output'] 45 | 46 | collect_data.append({ 47 | 'id': 48 | f'alpaca_{i}', 49 | 'conversations': [ 50 | { 51 | 'from': 'human', 52 | 'value': prompt 53 | }, 54 | { 55 | 'from': 'gpt', 56 | 'value': response 57 | }, 58 | ], 59 | }) 60 | print(f'Original: {len(raw_dataset)}, Converted: {len(collect_data)}') 61 | json_dump(collect_data, out_file) 62 | return collect_data 63 | 64 | 65 | if __name__ == '__main__': 66 | in_file = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.json' 67 | out_file = '/home/robin/prompt_data/100PoisonMpts/train_vicuna.json' 68 | collect_data = convert_alpaca_vicuna(in_file, out_file) 69 | 70 | data_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json' 71 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_vicuna.json' 72 | convert_alpaca_vicuna(data_path, out_file=out_path) 73 | 74 | data_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json' 75 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_vicuna.json' 76 | convert_alpaca_vicuna(data_path, out_file=out_path) 77 | 78 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json' 79 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_vicnua.json' 80 | convert_alpaca_vicuna(data_path, out_file=out_path) 81 | 82 | data_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json' 83 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json' 84 | convert_alpaca_vicuna(data_path, out_file=out_path) 85 | 86 | data_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json' 87 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json' 88 | convert_alpaca_vicuna(data_path, out_file=out_path) 89 | 90 | data_path = '/home/robin/prompt_data/COIG/train_alpaca.json' 91 | out_path = '/home/robin/prompt_data/COIG/train_vicuna.json' 92 | convert_alpaca_vicuna(data_path, out_file=out_path) 93 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/gen_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Any, Dict, Optional 3 | 4 | 5 | @dataclass 6 | class GenerationArguments: 7 | """ 8 | Arguments pertaining to specify the model generation parameters. 
9 | """ 10 | # generation parameters 11 | # 是否使用cache 12 | use_cache: Optional[bool] = field(default=True) 13 | # Length arguments 14 | # 最大的新生成的token数量 15 | max_new_tokens: Optional[int] = field( 16 | default=1024, 17 | metadata={ 18 | 'help': 19 | 'Maximum number of new tokens to be generated in evaluation or prediction loops' 20 | 'if predict_with_generate is set.' 21 | }) 22 | # 最少的新生成的token数量 23 | min_new_tokens: Optional[int] = field( 24 | default=0, 25 | metadata={'help': 'Minimum number of new tokens to generate.'}) 26 | # 最大的token数量,会被 max_new_tokens 覆盖 27 | max_length: Optional[int] = field( 28 | default=None, 29 | metadata={ 30 | 'help': 31 | 'The maximum length the generated tokens can have. It can be overridden by max_new_tokens.' 32 | }) 33 | # Generation strategy 34 | # 是否采样 35 | do_sample: Optional[bool] = field( 36 | default=True, 37 | metadata={ 38 | 'help': 39 | 'Whether or not to use sampling, use greedy decoding otherwise.' 40 | }) 41 | # 集束搜索的数量 42 | num_beams: Optional[int] = field( 43 | default=1, 44 | metadata={ 45 | 'help': 'Number of beams for beam search. 1 means no beam search.' 46 | }) 47 | # 集束搜索的组数量 48 | num_beam_groups: Optional[int] = field(default=1) 49 | # 惩罚因子 50 | penalty_alpha: Optional[float] = field(default=None) 51 | # Hyperparameters for logit manipulation 52 | # softmax 函数的温度因子,来调节输出token的分布 53 | temperature: Optional[float] = field( 54 | default=1.0, 55 | metadata={ 56 | 'help': 'The value used to modulate the next token probabilities.' 57 | }) 58 | # top_k随机搜索中的k个最高概率选择 59 | top_k: Optional[int] = field( 60 | default=50, 61 | metadata={ 62 | 'help': 63 | 'The number of highest probability vocabulary tokens to keep for top-k filtering.' 64 | }) 65 | # 核采样参数,top_p最高的前n个(n是变化)概率和为p,从这些n个候选token中随机采样 66 | top_p: Optional[float] = field( 67 | default=1.0, 68 | metadata={ 69 | 'help': 70 | 'The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept.' 71 | }) 72 | # 典型p值 73 | typical_p: Optional[float] = field(default=1.0) 74 | # 丰富性惩罚因子 75 | diversity_penalty: Optional[float] = field(default=0.0) 76 | # 重复性惩罚因子 77 | repetition_penalty: Optional[float] = field( 78 | default=1.0, 79 | metadata={ 80 | 'help': 81 | 'The parameter for repetition penalty. 1.0 means no penalty.' 82 | }) 83 | # 长度惩罚因子 84 | length_penalty: Optional[float] = field( 85 | default=1.0, 86 | metadata={ 87 | 'help': 88 | 'Exponential penalty to the length that is used with beam-based generation.' 89 | }) 90 | # 没有ngram重复的尺度大小 91 | # 一般随机采样的丰富性够了,所以一般不会设置,如果重复很多则设置为2是比较好的选择 92 | no_repeat_ngram_size: Optional[int] = field(default=0) 93 | 94 | def to_dict(self) -> Dict[str, Any]: 95 | args = asdict(self) 96 | if args.get('max_new_tokens', None): 97 | args.pop('max_length', None) 98 | return args 99 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/utils/apply_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Apply the LoRA weights on top of a base model. 
3 | 4 | Usage: 5 | python3 apply_lora.py --base_model_path ~/model_weights/llama-7b --target_model_path ~/model_weights/baize-7b \ 6 | --lora_path project-baize/baize-lora-7B 7 | 8 | Dependency: 9 | pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b 10 | """ 11 | import argparse 12 | from typing import Tuple 13 | 14 | import torch 15 | from peft import PeftModel 16 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel 17 | 18 | 19 | def apply_lora( 20 | base_model_path: str, 21 | lora_model_path: str, 22 | target_model_path: str = None, 23 | cache_dir: str = None, 24 | use_auth_token: str = True, 25 | trust_remote_code: bool = True, 26 | ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: 27 | """Applies the LoRA adapter to a base model and saves the resulting target model (optional). 28 | 29 | Args: 30 | base_model_path (str): The path to the base model to which the LoRA adapter will be applied. 31 | lora_model_path (str): The path to the LoRA adapter. 32 | target_model_path (str): The path where the target model will be saved (if `save_target_model=True`). 33 | cache_dir (str): The path to the cache directory. 34 | use_auth_token (bool): Whether to use an authentication token when downloading the model. 35 | trust_remote_code (bool): Whether to trust remote code when downloading the model. 36 | 37 | Returns: 38 | Tuple[AutoModelForCausalLM, AutoTokenizer]: A tuple containing the target model and its tokenizer. 39 | 40 | """ 41 | # Load the base model and tokenizer 42 | print(f'Loading the base model from {base_model_path}') 43 | # Set configuration kwargs for tokenizer. 44 | config_kwargs = { 45 | 'cache_dir': cache_dir, 46 | 'use_auth_token': use_auth_token, 47 | 'trust_remote_code': trust_remote_code, 48 | } 49 | 50 | base_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( 51 | base_model_path, 52 | device_map='auto', 53 | torch_dtype=torch.float16, 54 | low_cpu_mem_usage=True, 55 | **config_kwargs, 56 | ) 57 | 58 | # Load the tokenizer 59 | print(f'Loading the tokenizer from {base_model_path}') 60 | # Due to the name of Transformers' LlamaTokenizer, we have to do this 61 | tokenizer = AutoTokenizer.from_pretrained( 62 | base_model_path, 63 | use_fast=False, 64 | **config_kwargs, 65 | ) 66 | 67 | # Load the LoRA adapter 68 | print(f'Loading the LoRA adapter from {lora_model_path}') 69 | model: PreTrainedModel = PeftModel.from_pretrained(base_model, 70 | lora_model_path) 71 | print('Applying the LoRA to base model') 72 | model = model.merge_and_unload() 73 | 74 | if target_model_path is not None: 75 | print(f'Saving the target model to {target_model_path}') 76 | model.save_pretrained(target_model_path) 77 | tokenizer.save_pretrained(target_model_path) 78 | 79 | return model, tokenizer 80 | 81 | 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--base-model-path', type=str, required=True) 85 | parser.add_argument('--target-model-path', type=str, default=None) 86 | parser.add_argument('--lora-model-path', type=str, required=True) 87 | args = parser.parse_args() 88 | 89 | apply_lora(base_model_path=args.base_model_path, 90 | lora_model_path=args.lora_model_path, 91 | target_model_path=args.target_model_path) 92 | -------------------------------------------------------------------------------- /retrieval_contriever/src/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | class Gather(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, x: torch.tensor): 10 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())] 11 | dist.all_gather(output, x) 12 | return tuple(output) 13 | 14 | @staticmethod 15 | def backward(ctx, *grads): 16 | all_gradients = torch.stack(grads) 17 | dist.all_reduce(all_gradients) 18 | return all_gradients[dist.get_rank()] 19 | 20 | 21 | def gather(x: torch.tensor): 22 | if not dist.is_initialized(): 23 | return x 24 | x_gather = Gather.apply(x) 25 | x_gather = torch.cat(x_gather, dim=0) 26 | return x_gather 27 | 28 | 29 | @torch.no_grad() 30 | def gather_nograd(x: torch.tensor): 31 | if not dist.is_initialized(): 32 | return x 33 | x_gather = [torch.ones_like(x) for _ in range(dist.get_world_size())] 34 | dist.all_gather(x_gather, x, async_op=False) 35 | 36 | x_gather = torch.cat(x_gather, dim=0) 37 | return x_gather 38 | 39 | 40 | @torch.no_grad() 41 | def varsize_gather_nograd(x: torch.Tensor): 42 | """gather tensors of different sizes along the first dimension""" 43 | if not dist.is_initialized(): 44 | return x 45 | 46 | # determine max size 47 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int) 48 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())] 49 | dist.all_gather(allsizes, size) 50 | max_size = max([size.cpu().max() for size in allsizes]) 51 | 52 | padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device) 53 | padded[: x.shape[0]] = x 54 | output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())] 55 | dist.all_gather(output, padded) 56 | 57 | output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)] 58 | output = torch.cat(output, dim=0) 59 | 60 | return output 61 | 62 | 63 | @torch.no_grad() 64 | def get_varsize(x: torch.Tensor): 65 | """gather tensors of different sizes along the first dimension""" 66 | if not dist.is_initialized(): 67 | return [x.shape[0]] 68 | 69 | # determine max size 70 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int) 71 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())] 72 | dist.all_gather(allsizes, size) 73 | allsizes = torch.cat(allsizes) 74 | return allsizes 75 | 76 | 77 | def get_rank(): 78 | if not dist.is_available(): 79 | return 0 80 | if not dist.is_initialized(): 81 | return 0 82 | return dist.get_rank() 83 | 84 | 85 | def is_main(): 86 | return get_rank() == 0 87 | 88 | 89 | def get_world_size(): 90 | if not dist.is_initialized(): 91 | return 1 92 | else: 93 | return dist.get_world_size() 94 | 95 | 96 | def barrier(): 97 | if dist.is_initialized(): 98 | dist.barrier() 99 | 100 | 101 | def average_main(x): 102 | if not dist.is_initialized(): 103 | return x 104 | if dist.is_initialized() and dist.get_world_size() > 1: 105 | dist.reduce(x, 0, op=dist.ReduceOp.SUM) 106 | if is_main(): 107 | x = x / dist.get_world_size() 108 | return x 109 | 110 | 111 | def sum_main(x): 112 | if not dist.is_initialized(): 113 | return x 114 | if dist.is_initialized() and dist.get_world_size() > 1: 115 | dist.reduce(x, 0, op=dist.ReduceOp.SUM) 116 | return x 117 | 118 | 119 | def weighted_average(x, count): 120 | if not dist.is_initialized(): 121 | if isinstance(x, torch.Tensor): 122 | x = x.item() 123 | return x, count 124 | t_loss = torch.tensor([x * count]).cuda() 125 | t_total = torch.tensor([count]).cuda() 126 | t_loss = sum_main(t_loss) 127 | 
t_total = sum_main(t_total) 128 | return (t_loss / t_total).item(), t_total.item() 129 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/model/sample_generate_callback.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | from typing import Any, Dict 4 | 5 | from transformers import PreTrainedTokenizer, TrainerCallback 6 | 7 | 8 | @dataclass 9 | class SampleGenerateCallback(TrainerCallback): 10 | """ 11 | A callback that generates text samples from a pre-trained language model during training. 12 | 13 | Args: 14 | tokenizer (PreTrainedTokenizer): The tokenizer used to preprocess inputs. 15 | max_new_tokens (int): The maximum number of tokens to generate in response to each input. 16 | """ 17 | def __init__(self, tokenizer: PreTrainedTokenizer, 18 | generation_config: argparse.Namespace, logger: None): 19 | self.tokenizer = tokenizer 20 | self.generation_config = generation_config 21 | self.logger = logger 22 | 23 | # Define input prompts to generate text from 24 | self.sample_inputs = [ 25 | '用一句话描述地球为什么是独一无二的。', 26 | '中国是否应该推出刺激政策救楼市?', 27 | '如何更好地融入新工作圈子', 28 | '帮我把这段文字转换成鲁迅作品里的语气:昨天上午,算几个数学问题时越算越难受,有想要撕掉草稿纸的冲动思维也变得缓慢,见字忘意,感觉大脑里是一团浆糊,阻力很大。' 29 | '我怀疑自己抑郁又犯了,站起身离开了书桌。走出大门,开始跑步,运动,希望能借此缓解。我不想再吃药,我担心不吃药是否能恢复。稍微运动后,大吃了一顿,路上不停的对自己说,我可以.', 30 | '回来后,感觉似乎确实好一些。', 31 | '给我写一篇大模型的新闻稿', 32 | '你觉得人类哪些工作岗位会被AI替代?', 33 | '请帮我写一封中式婚礼请帖,用于邀请亲朋好友参加我的婚礼!', 34 | '帮我写一篇八百字以上的作文,主题是:当代青年面对时代的挑战如何肩负起民族复兴的伟大任务', 35 | '请仿照李荣浩的风格写一首表现爱情的歌曲,以“辣椒酱”为题。', 36 | '秦王朝时期十大将军是?其主要功绩是什么?', 37 | '帮我写一段广告,关于房产销售的,我们的房子首付低,赠送面积大,还免两年物业费!', 38 | '请帮我设计一个时长为3天的北京旅游行程,行程的内容不要太紧凑,使用地铁作为交通工具,并前往前门、天安门、天坛公园、鸟巢游览,同时预留一天的时间游玩环球影城。', 39 | '一个笼子里面有若干只鸡和兔子,总共有50只脚和18个头,求鸡和兔子各有多少只?', 40 | '生成一篇短篇小说,故事情节为一个年轻人在旅途中遇到了一位神秘的老人,老人告诉他一个令人意想不到的秘密,最终年轻人的生活因此发生了翻天覆地的变化。', 41 | '导师想要我论文的一作,我应该怎么办?', 42 | '我现在很无聊,可以讲点有趣的事情吗?', 43 | '一项工程,甲、乙两队合作20天完成,乙丙两队合作60天完成,丙丁两队合作30完成,甲丁合作多少天完成?', 44 | '如果一位孕妇走上了公交车,但是车上没有空位了。请模拟一位热心乘客给孕妇让座的对话。', 45 | '桃花潭水深千尺,不及汪伦送我情。体现的是怎样的心情?', 46 | '编写一个简单的自动化脚本,用于批量操作文件或目录。脚本功能可以自由选择,如复制、压缩、重命名、删除等。脚本语言可使用Python、Shell、Perl等,代码长度不少于100行。', 47 | '音乐可以洗涤人的灵魂吗?', 48 | ] 49 | 50 | def on_evaluate(self, args: Any, state: Dict[str, Any], control: Any, 51 | **kwargs: Any) -> None: 52 | """ 53 | Generates text samples from the language model during evaluation. 54 | 55 | Args: 56 | args (Any): Trainer arguments, not used in this method. 57 | state (Dict[str, Any]): Trainer state dictionary, not used in this method. 58 | control (Any): Trainer control object, not used in this method. 59 | kwargs (Dict[str, Any]): Keyword arguments passed to the method, including the pre-trained 60 | language model (under the key 'model') and any additional parameters needed for generation. 
61 | 62 | Returns: 63 | None 64 | """ 65 | logger = self.logger 66 | logger.info('Generating sample text during evaluation...') 67 | 68 | # Check if the pre-trained language model is available 69 | if 'model' in kwargs: 70 | model = kwargs['model'] 71 | 72 | # Generate text for each input prompt 73 | for instruction in self.sample_inputs: 74 | # Preprocess input prompt and convert to tensor 75 | inputs = f'{instruction}\n\n### Response: ' 76 | inputs = self.tokenizer(inputs, return_tensors='pt') 77 | inputs = inputs.to(model.device) 78 | 79 | # Generate text from input prompt 80 | generation_output = model.generate( 81 | **inputs, 82 | generation_config=self.generation_config, 83 | ) 84 | 85 | # Decode generated text and log it 86 | generated_text = self.tokenizer.decode(generation_output[0]) 87 | logger.info(f'Input prompt: {instruction}') 88 | logger.info(f'Generated text: {generated_text}') 89 | 90 | else: 91 | logger.info( 92 | 'Pre-trained language model not found in kwargs, skipping.') 93 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/model/compute_metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, List, Tuple, Union 3 | 4 | import jieba 5 | import numpy as np 6 | from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu 7 | from rouge_chinese import Rouge 8 | from transformers import PreTrainedTokenizer 9 | 10 | 11 | @dataclass 12 | class ComputeMetrics: 13 | """ 14 | Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer. 15 | Borrowed from: https://github.com/THUDM/ChatGLM-6B/blob/0c2806fea82683349194e21996dd6b3acc3c265b/ptuning/main.py#L307 16 | 17 | """ 18 | def __init__(self, tokenizer: PreTrainedTokenizer) -> None: 19 | """ 20 | Initialize the ComputeMetrics class with a pre-trained tokenizer object. 21 | 22 | Args: 23 | tokenizer (PreTrainedTokenizer): A pre-trained tokenizer object to be used for decoding tokenized sequences. 24 | """ 25 | self.tokenizer = tokenizer 26 | 27 | def __call__( 28 | self, eval_preds: List[Union[np.ndarray, Tuple[np.ndarray]]] 29 | ) -> Dict[str, float]: 30 | """ 31 | Computes evaluation metrics for model predictions. 32 | 33 | Args: 34 | eval_preds (List[Union[np.ndarray, Tuple[np.ndarray]]]): List of tuples containing prediction and label arrays. 35 | 36 | Returns: 37 | Dict[str, float]: A dictionary containing the average of each computed metric over all prediction-label pairs. 38 | """ 39 | 40 | # Extract predictions and labels from input 41 | preds, labels = eval_preds 42 | if isinstance(preds, tuple): 43 | preds = preds[0] 44 | 45 | # Replace IGNORE_INDEX in the labels with pad_token_id as we cannot decode them if ignore_pad_token_for_loss=True. 
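        # Note: IGNORE_INDEX here refers to the -100 sentinel that Hugging Face data collators write
        # into the labels for prompt/padding positions so they are excluded from the loss;
        # tokenizer.decode cannot handle -100, so those positions are mapped back to pad_token_id.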
46 |         preds = np.where(preds != -100, preds,
47 |                          self.tokenizer.pad_token_id)
48 |         labels = np.where(labels != -100, labels,
49 |                           self.tokenizer.pad_token_id)
50 | 
51 |         score_dict = {
52 |             'rouge-1': [],  # the numeral one (1)
53 |             'rouge-2': [],
54 |             'rouge-l': [],  # the letter l (L)
55 |             'bleu-4': []
56 |         }
57 | 
58 |         # Calculate metrics for each prediction-label pair
59 |         for pred, label in zip(preds, labels):
60 |             pred = pred[(pred == self.tokenizer.bos_token_id
61 |                          ).nonzero()[0][0]:]  # remove the query
62 |             hypothesis = list(
63 |                 jieba.cut(self.tokenizer.decode(pred,
64 |                                                 skip_special_tokens=True)))
65 |             reference = list(
66 |                 jieba.cut(
67 |                     self.tokenizer.decode(label, skip_special_tokens=True)))
68 | 
69 |             # If there are no words in the hypothesis, set all scores to 0
70 |             if len(' '.join(hypothesis).split()) == 0:
71 |                 result = {
72 |                     'rouge-1': {
73 |                         'f': 0.0
74 |                     },
75 |                     'rouge-2': {
76 |                         'f': 0.0
77 |                     },
78 |                     'rouge-l': {
79 |                         'f': 0.0
80 |                     }
81 |                 }
82 |             else:
83 |                 rouge = Rouge()
84 |                 scores = rouge.get_scores(' '.join(hypothesis),
85 |                                           ' '.join(reference))
86 |                 result = scores[0]
87 | 
88 |             # Append scores to score_dict
89 |             for k, v in result.items():
90 |                 score_dict[k].append(round(v['f'] * 100, 4))
91 | 
92 |             # Calculate BLEU-4 score and append it to score_dict
93 |             bleu_score = sentence_bleu(
94 |                 [list(label)],
95 |                 list(pred),
96 |                 smoothing_function=SmoothingFunction().method3)
97 |             score_dict['bleu-4'].append(round(bleu_score * 100, 4))
98 | 
99 |         # Calculate average of each metric over all prediction-label pairs and return as a dictionary
100 |         return {k: float(np.mean(v)) for k, v in score_dict.items()}
101 | 
--------------------------------------------------------------------------------
/retrieval_contriever/src/slurm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from logging import getLogger
8 | import os
9 | import sys
10 | import torch
11 | import socket
12 | import signal
13 | import subprocess
14 | 
15 | 
16 | logger = getLogger()
17 | 
18 | def sig_handler(signum, frame):
19 |     logger.warning("Signal handler called with signal " + str(signum))
20 |     prod_id = int(os.environ['SLURM_PROCID'])
21 |     logger.warning("Host: %s - Global rank: %i" % (socket.gethostname(), prod_id))
22 |     if prod_id == 0:
23 |         logger.warning("Requeuing job " + os.environ['SLURM_JOB_ID'])
24 |         os.system('scontrol requeue ' + os.environ['SLURM_JOB_ID'])
25 |     else:
26 |         logger.warning("Not the main process, no need to requeue.")
27 |     sys.exit(-1)
28 | 
29 | 
30 | def term_handler(signum, frame):
31 |     logger.warning("Signal handler called with signal " + str(signum))
32 |     logger.warning("Bypassing SIGTERM.")
33 | 
34 | 
35 | def init_signal_handler():
36 |     """
37 |     Handle signals sent by SLURM for time limit / pre-emption.
38 |     """
39 |     signal.signal(signal.SIGUSR1, sig_handler)
40 |     signal.signal(signal.SIGTERM, term_handler)
41 | 
42 | 
43 | def init_distributed_mode(params):
44 |     """
45 |     Handle single and multi-GPU / multi-node / SLURM jobs.
46 | Initialize the following variables: 47 | - local_rank 48 | - global_rank 49 | - world_size 50 | """ 51 | is_slurm_job = 'SLURM_JOB_ID' in os.environ and not 'WORLD_SIZE' in os.environ 52 | has_local_rank = hasattr(params, 'local_rank') 53 | 54 | # SLURM job without torch.distributed.launch 55 | if is_slurm_job and has_local_rank: 56 | 57 | assert params.local_rank == -1 # on the cluster, this is handled by SLURM 58 | 59 | # local rank on the current node / global rank 60 | params.local_rank = int(os.environ['SLURM_LOCALID']) 61 | params.global_rank = int(os.environ['SLURM_PROCID']) 62 | params.world_size = int(os.environ['SLURM_NTASKS']) 63 | 64 | # define master address and master port 65 | hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', os.environ['SLURM_JOB_NODELIST']]) 66 | params.main_addr = hostnames.split()[0].decode('utf-8') 67 | assert 10001 <= params.main_port <= 20000 or params.world_size == 1 68 | 69 | # set environment variables for 'env://' 70 | os.environ['MASTER_ADDR'] = params.main_addr 71 | os.environ['MASTER_PORT'] = str(params.main_port) 72 | os.environ['WORLD_SIZE'] = str(params.world_size) 73 | os.environ['RANK'] = str(params.global_rank) 74 | is_distributed = True 75 | 76 | 77 | # multi-GPU job (local or multi-node) - jobs started with torch.distributed.launch 78 | elif has_local_rank and params.local_rank != -1: 79 | 80 | assert params.main_port == -1 81 | 82 | # read environment variables 83 | params.global_rank = int(os.environ['RANK']) 84 | params.world_size = int(os.environ['WORLD_SIZE']) 85 | 86 | is_distributed = True 87 | 88 | # local job (single GPU) 89 | else: 90 | params.local_rank = 0 91 | params.global_rank = 0 92 | params.world_size = 1 93 | is_distributed = False 94 | 95 | # set GPU device 96 | torch.cuda.set_device(params.local_rank) 97 | 98 | # initialize multi-GPU 99 | if is_distributed: 100 | 101 | # http://pytorch.apachecn.org/en/0.3.0/distributed.html#environment-variable-initialization 102 | # 'env://' will read these environment variables: 103 | # MASTER_PORT - required; has to be a free port on machine with rank 0 104 | # MASTER_ADDR - required (except for rank 0); address of rank 0 node 105 | # WORLD_SIZE - required; can be set either here, or in a call to init function 106 | # RANK - required; can be set either here, or in a call to init function 107 | 108 | #print("Initializing PyTorch distributed ...") 109 | torch.distributed.init_process_group( 110 | init_method='env://', 111 | backend='nccl', 112 | #world_size=params.world_size, 113 | #rank=params.global_rank, 114 | ) -------------------------------------------------------------------------------- /source/model/llama2/train_qlora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import torch 6 | import transformers 7 | from transformers import GenerationConfig, Trainer, set_seed 8 | 9 | from chatllms.configs import (DataArguments, GenerationArguments, 10 | LoraArguments, ModelArguments, QuantArguments, 11 | TrainingArguments) 12 | from chatllms.data import make_supervised_data_module 13 | from chatllms.model import (MMLUEvalCallback, SampleGenerateCallback, 14 | SavePeftModelCallback, load_model_tokenizer) 15 | from chatllms.train.training import train_and_evaluate 16 | from chatllms.utils.logger_utils import get_root_logger 17 | from chatllms.utils.model_utils import (check_training_finished, 18 | print_trainable_parameters, 19 | verify_dtypes) 20 | 21 | 
torch.backends.cuda.matmul.allow_tf32 = True 22 | 23 | 24 | def main(): 25 | parser = transformers.HfArgumentParser( 26 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments, 27 | QuantArguments, GenerationArguments)) 28 | (model_args, data_args, training_args, lora_args, quant_args, 29 | generation_args) = parser.parse_args_into_dataclasses() 30 | # Check arguments (do not check finetuning_args since it may be loaded from checkpoints) 31 | data_args.init_for_training() 32 | training_args.generation_config = GenerationConfig(**vars(generation_args)) 33 | 34 | args = argparse.Namespace(**vars(model_args), **vars(data_args), 35 | **vars(training_args), **vars(lora_args), 36 | **vars(quant_args)) 37 | # init the logger before other steps 38 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 39 | if not os.path.exists(args.output_dir): 40 | os.makedirs(args.output_dir) 41 | log_file = os.path.join(args.output_dir, f'{timestamp}.log') 42 | logger = get_root_logger(log_file=log_file, log_level='INFO') 43 | 44 | # Log on each process the small summary: 45 | logger.info( 46 | f'Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}' 47 | + 48 | f'distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}' 49 | ) 50 | logger.info('Training/evaluation parameters %s', args) 51 | # Check if training was already completed. 52 | checkpoint_dir, completed_training = check_training_finished(args, logger) 53 | args.resume_checkpoint = checkpoint_dir 54 | 55 | # load model and tokenizer 56 | model, tokenizer = load_model_tokenizer( 57 | args=args, 58 | checkpoint_dir=checkpoint_dir, 59 | is_trainable=args.do_train, 60 | logger=logger, 61 | ) 62 | logger.info('Loaded model...') 63 | 64 | logger.info('Printing trainable parameters...') 65 | print_trainable_parameters(args, model) 66 | 67 | set_seed(args.seed) 68 | 69 | # Verify dtypes 70 | logger.info('Verifying dtypes...') 71 | verify_dtypes(model) 72 | 73 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args) 74 | trainer = Trainer(model=model, 75 | tokenizer=tokenizer, 76 | args=training_args, 77 | **data_module) 78 | # Add callback to save adapter model. 79 | if not args.full_finetune: 80 | trainer.add_callback(SavePeftModelCallback) 81 | 82 | # Add callback to generate samples. 
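    # SampleGenerateCallback hooks into on_evaluate, so sample completions are only generated when
    # the Trainer actually runs evaluation; it logs the decoded outputs for a fixed list of prompts
    # using a GenerationConfig built from generation_args.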
83 | if args.sample_generate: 84 | trainer.add_callback( 85 | SampleGenerateCallback( 86 | tokenizer=tokenizer, 87 | generation_config=GenerationConfig(**vars(generation_args)), 88 | logger=logger, 89 | )) 90 | 91 | if args.do_mmlu_eval: 92 | eval_callback = MMLUEvalCallback( 93 | trainer=trainer, 94 | tokenizer=tokenizer, 95 | data_dir='./data', 96 | args=args, 97 | ) 98 | trainer.add_callback(eval_callback) 99 | 100 | assert args.do_train or args.do_eval or args.do_predict 101 | if args.do_train or args.do_eval: 102 | train_and_evaluate(trainer, args, logger) 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/model/save_peft_model_callback.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict 3 | 4 | from transformers import (PreTrainedModel, TrainerCallback, TrainerControl, 5 | TrainingArguments) 6 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 7 | 8 | 9 | class SavePeftModelCallback(TrainerCallback): 10 | """ 11 | Callback to save PEFT model checkpoints during training. 12 | 13 | Saves both the full model and the adapter model to separate directories 14 | within the checkpoint directory. 15 | """ 16 | def save_model(self, args: Any, state: TrainingArguments, 17 | kwargs: Dict[str, Any]) -> None: 18 | """ 19 | Saves the PEFT model checkpoint. 20 | 21 | Args: 22 | args (Any): The command line arguments passed to the script. 23 | state (TrainingArguments): The current state of training. 24 | kwargs (Dict[str, Any]): A dictionary of additional keyword arguments. 25 | 26 | Raises: 27 | TypeError: If `state` is not an instance of `TrainingArguments`. 28 | """ 29 | print('+' * 20, 'Saving PEFT Model Checkpoint CallBack', '+' * 20) 30 | 31 | # Get the checkpoint directory for saving models. 32 | if state.best_model_checkpoint is not None: 33 | # If best model checkpoint exists, use its directory as the checkpoint folder 34 | checkpoint_dir = os.path.join(state.best_model_checkpoint, 35 | 'adapter_model') 36 | else: 37 | # Otherwise, create a new checkpoint folder using the output directory and current global step 38 | checkpoint_dir = os.path.join( 39 | args.output_dir, 40 | f'{PREFIX_CHECKPOINT_DIR}-{state.global_step}') 41 | 42 | # Create path for the PEFT model 43 | peft_model_path = os.path.join(checkpoint_dir, 'adapter_model') 44 | model: PreTrainedModel = kwargs['model'] 45 | model.save_pretrained(peft_model_path) 46 | 47 | # Create path for the PyTorch model binary file and remove it if it already exists 48 | pytorch_model_path = os.path.join(checkpoint_dir, 'pytorch_model.bin') 49 | if os.path.exists(pytorch_model_path): 50 | os.remove(pytorch_model_path) 51 | 52 | def on_save(self, args: Any, state: TrainingArguments, 53 | control: TrainerControl, 54 | **kwargs: Dict[str, Any]) -> TrainerControl: 55 | """ 56 | Callback method that calls save_model() and returns `control` argument. 57 | 58 | Args: 59 | args (Any): The command line arguments passed to the script. 60 | state (TrainingArguments): The current state of training. 61 | control (trainer_callback.TrainerControl): \ 62 | The current state of the TrainerCallback's control flow. 63 | kwargs (Dict[str, Any]): A dictionary of additional keyword arguments. 64 | 65 | Returns: 66 | trainer_callback.TrainerControl: The current state of the TrainerCallback's control flow. 
67 | 68 | Raises: 69 | TypeError: If `state` is not an instance of `TrainingArguments`. 70 | """ 71 | self.save_model(args, state, kwargs) 72 | return control 73 | 74 | def on_train_end(self, args: Any, state: TrainingArguments, 75 | control: TrainerControl, **kwargs: Dict[str, 76 | Any]) -> None: 77 | """ 78 | Callback method that saves the model checkpoint and creates a 'completed' file in the output directory. 79 | 80 | Args: 81 | args (Any): The command line arguments passed to the script. 82 | state (TrainingArguments): The current state of training. 83 | control (trainer_callback.TrainerControl): \ 84 | The current state of the TrainerCallback's control flow. 85 | kwargs (Dict[str, Any]): A dictionary of additional keyword arguments. 86 | 87 | Raises: 88 | TypeError: If `state` is not an instance of `TrainingArguments`. 89 | """ 90 | 91 | # Define a helper function to create a 'completed' file in the output directory 92 | def touch(fname, times=None): 93 | with open(fname, 'a'): 94 | os.utime(fname, times) 95 | 96 | # Create the 'completed' file in the output directory 97 | touch(os.path.join(args.output_dir, 'completed')) 98 | -------------------------------------------------------------------------------- /source/model/llama2/server/single_chat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import sys 4 | from threading import Thread 5 | from typing import List 6 | 7 | import torch 8 | import transformers 9 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, 10 | PreTrainedTokenizer, TextIteratorStreamer) 11 | 12 | sys.path.append('../') 13 | from chatllms.configs import GenerationArguments, ModelInferenceArguments 14 | from chatllms.utils.model_utils import get_logits_processor 15 | 16 | 17 | def generate_response(query: str, tokenizer: PreTrainedTokenizer, 18 | model: PreTrainedModel, 19 | generation_args: dict) -> List[str]: 20 | """ 21 | Generates a response to the given query using GPT-3.5 model and prints it to the console. 22 | 23 | Args: 24 | query (str): The input query for which a response is to be generated. 25 | tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens. 26 | model (PreTrainedModel): The GPT-3.5 model used to generate the response. 27 | generation_args (dict): A dictionary containing the arguments to be passed to the generate() method of the model. 28 | 29 | Returns: 30 | List[Tuple[str, str]]: A list of all the previous queries and their responses, including the current one. 
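
    Example (illustrative; assumes `model`, `tokenizer` and `generation_args` are created as in main() below):
        response = generate_response('Hello, please introduce yourself.',
                                     tokenizer, model, generation_args)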
31 | """ 32 | 33 | # Convert the query and history into input IDs 34 | inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False) 35 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 36 | 37 | # Create a TextIteratorStreamer object to stream the response from the model 38 | streamer = TextIteratorStreamer(tokenizer, 39 | timeout=60.0, 40 | skip_prompt=True, 41 | skip_special_tokens=True) 42 | 43 | # Set the arguments for the model's generate() method 44 | gen_kwargs = dict( 45 | **inputs, 46 | streamer=streamer, 47 | logits_processor=get_logits_processor(), 48 | **generation_args.to_dict(), 49 | ) 50 | 51 | # Start a separate thread to generate the response asynchronously 52 | thread = Thread(target=model.generate, kwargs=gen_kwargs) 53 | thread.start() 54 | 55 | # Print the model name and the response as it is generated 56 | print('Assistant: ', end='', flush=True) 57 | response = '' 58 | for new_text in streamer: 59 | print(new_text, end='', flush=True) 60 | response += new_text 61 | # Update the history with the current query and response and return it 62 | return response 63 | 64 | 65 | def main(): 66 | """ 67 | 单轮对话,不具有对话历史的记忆功能 68 | Run conversational agent loop with input/output. 69 | 70 | Args: 71 | model_args: Arguments for loading model 72 | gen_args: Arguments for model.generate() 73 | 74 | Returns: 75 | None 76 | """ 77 | 78 | # Parse command-line arguments 79 | parser = transformers.HfArgumentParser( 80 | (ModelInferenceArguments, GenerationArguments)) 81 | model_server_args, generation_args = parser.parse_args_into_dataclasses() 82 | 83 | # Load the pretrained language model. 84 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 85 | 86 | model = AutoModelForCausalLM.from_pretrained( 87 | model_server_args.model_name_or_path, 88 | trust_remote_code=True, 89 | low_cpu_mem_usage=True, 90 | torch_dtype=torch.float16, 91 | device_map='auto').to(device).eval() 92 | 93 | tokenizer = AutoTokenizer.from_pretrained( 94 | model_server_args.model_name_or_path, 95 | trust_remote_code=True, 96 | use_fast=False, 97 | ) 98 | 99 | os_name = platform.system() 100 | clear_command = 'cls' if os_name == 'Windows' else 'clear' 101 | # Set the arguments for the model's generate() method 102 | print('欢迎使用 CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序') 103 | input_pattern = '{}' 104 | while True: 105 | query = input('\nUser: ') 106 | if query.strip() == 'stop': 107 | break 108 | 109 | if query.strip() == 'clear': 110 | os.system(clear_command) 111 | print('History has been removed.') 112 | print('欢迎使用CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序') 113 | continue 114 | 115 | query = input_pattern.format(query) 116 | # Perform prediction and printing 117 | generate_response(query, tokenizer, model, generation_args) 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /source/model/llama2/server/gradio_base_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gradio as gr 4 | import torch 5 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig 6 | 7 | from chatllms.utils.apply_lora import apply_lora 8 | 9 | 10 | def args_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_name_or_path', 13 | default=None, 14 | type=str, 15 | required=True, 16 | help='Path to pre-trained model') 17 | parser.add_argument('--lora_model_name_or_path', 18 | default=None, 19 | 
type=str, 20 | help='Path to pre-trained model') 21 | parser.add_argument('--no_cuda', 22 | action='store_true', 23 | help='Avoid using CUDA when available') 24 | parser.add_argument('--load_8bit', 25 | action='store_true', 26 | help='Whether to use load_8bit instead of 32-bit') 27 | args = parser.parse_args() 28 | 29 | args.device = torch.device( 30 | 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu') 31 | return args 32 | 33 | 34 | def main(args): 35 | if args.lora_model_name_or_path is not None: 36 | model, tokenizer = apply_lora(args.model_name_or_path, 37 | args.lora_model_name_or_path, 38 | load_8bit=args.load_8bit) 39 | else: 40 | tokenizer = AutoTokenizer.from_pretrained( 41 | pretrained_model_name_or_path=args.model_name_or_path, 42 | trust_remote_code=True) 43 | model = AutoModelForCausalLM.from_pretrained( 44 | pretrained_model_name_or_path=args.model_name_or_path, 45 | load_in_8bit=args.load_8bit, 46 | torch_dtype=torch.float16, 47 | device_map='auto', 48 | trust_remote_code=True) 49 | 50 | def evaluate( 51 | input=None, 52 | temperature=0.8, 53 | top_p=0.75, 54 | top_k=40, 55 | max_new_tokens=128, 56 | **kwargs, 57 | ): 58 | inputs = tokenizer(input, return_tensors='pt') 59 | inputs = inputs.to(args.device) 60 | generation_config = GenerationConfig( 61 | temperature=temperature, 62 | top_p=top_p, 63 | top_k=top_k, 64 | do_sample=True, 65 | no_repeat_ngram_size=6, 66 | repetition_penalty=1.8, 67 | **kwargs, 68 | ) 69 | # Without streaming 70 | with torch.no_grad(): 71 | generation_output = model.generate( 72 | **inputs, 73 | generation_config=generation_config, 74 | return_dict_in_generate=True, 75 | output_scores=True, 76 | max_new_tokens=max_new_tokens, 77 | ) 78 | s = generation_output.sequences[0] 79 | output = tokenizer.decode(s, skip_special_tokens=True) 80 | yield output 81 | 82 | description = 'Baichuan7B is a 7B-parameter LLaMA model finetuned to follow instructions.' 83 | server = gr.Interface( 84 | fn=evaluate, 85 | inputs=[ 86 | gr.components.Textbox(lines=2, label='Input', placeholder='none'), 87 | gr.components.Slider(minimum=0, 88 | maximum=1, 89 | value=0.1, 90 | label='Temperature'), 91 | gr.components.Slider(minimum=0, 92 | maximum=1, 93 | value=0.75, 94 | label='Top p'), 95 | gr.components.Slider(minimum=0, 96 | maximum=100, 97 | step=1, 98 | value=40, 99 | label='Top k'), 100 | gr.components.Slider(minimum=1, 101 | maximum=2000, 102 | step=1, 103 | value=128, 104 | label='Max tokens'), 105 | ], 106 | outputs=[gr.inputs.Textbox( 107 | lines=5, 108 | label='Output', 109 | )], 110 | title='Baichuan7B', 111 | description=description, 112 | ) 113 | 114 | server.queue().launch(server_name='0.0.0.0', share=False) 115 | 116 | 117 | if __name__ == '__main__': 118 | args = args_parser() 119 | main(args) 120 | -------------------------------------------------------------------------------- /source/model/llama2/data/dataset_info.yaml: -------------------------------------------------------------------------------- 1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments. 
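#
# Each entry follows the same schema (a sketch inferred from chatllms/configs/data_args.py;
# the entry name and paths below are placeholders, not a real dataset):
#
#   my_dataset:
#     hf_hub_url: org/dataset-name      # Hugging Face Hub id, or '' if only a local file is used
#     local_path: /path/to/data.json    # local file; used when it exists, otherwise hf_hub_url is loaded
#     dataset_format: alpaca            # record format, e.g. alpaca, sharegpt, dolly, hh-rlhf
#     multi_turn: False                 # True for multi-turn conversation data
#     columns:                          # optional column mapping
#       prompt: instruction
#       query: ''
#       response: output
#       history: ''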
2 | alpaca: 3 | hf_hub_url: tatsu-lab/alpaca 4 | local_path: tatsu-lab/alpaca/alpaca.json 5 | dataset_format: alpaca 6 | multi_turn: False 7 | 8 | alpaca-clean: 9 | hf_hub_url: yahma/alpaca-cleaned 10 | local_path: '' 11 | dataset_format: alpaca 12 | multi_turn: False 13 | 14 | coig: 15 | hf_hub_url: BAAI/COIG 16 | local_path: /home/robin/prompt_data//COIG/train_alpaca.json 17 | dataset_format: alpaca 18 | multi_turn: False 19 | 20 | dolly-15k: 21 | hf_hub_url: databricks/databricks-dolly-15k 22 | local_path: databricks/databricks-dolly-15k 23 | dataset_format: dolly 24 | multi_turn: False 25 | 26 | cvalues_comparison_train: 27 | hf_hub_url: '' 28 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json 29 | dataset_format: alpaca 30 | multi_turn: False 31 | 32 | cvalues_comparison_test: 33 | hf_hub_url: '' 34 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json 35 | dataset_format: alpaca 36 | multi_turn: False 37 | 38 | guanaco: 39 | hf_hub_url: JosephusCheung/GuanacoDataset 40 | local_path: '' 41 | dataset_format: guanaco 42 | multi_turn: False 43 | 44 | hh-rlhf: 45 | hf_hub_url: Anthropic/hh-rlhf 46 | local_path: '' 47 | dataset_format: hh-rlhf 48 | multi_turn: False 49 | 50 | huatuogpt: 51 | hf_hub_url: FreedomIntelligence/HuatuoGPT-sft-data-v1 52 | local_path: /home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.jsonl 53 | dataset_format: alpaca 54 | multi_turn: False 55 | 56 | openassistant-guanaco: 57 | hf_hub_url: timdettmers/openassistant-guanaco 58 | local_path: /home/robin/prompt_data/timdettmers/openassistant-guanaco 59 | dataset_format: alpaca 60 | multi_turn: False 61 | 62 | olcc: 63 | hf_hub_url: '' 64 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json 65 | dataset_format: alpaca 66 | multi_turn: False 67 | 68 | 100PoisonMpts: 69 | hf_hub_url: 'damo/100PoisonMpts' 70 | local_path: /home/robin/prompt_data/100PoisonMpts/train.jsonl 71 | dataset_format: 100PoisonMpts 72 | multi_turn: False 73 | 74 | safety_prompt_part1: 75 | hf_hub_url: '' 76 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json 77 | dataset_format: alpaca 78 | multi_turn: False 79 | 80 | safety_prompt_part2: 81 | hf_hub_url: '' 82 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json 83 | dataset_format: alpaca 84 | multi_turn: False 85 | 86 | # Belle Group 87 | belle_0.5m: 88 | hf_hub_url: BelleGroup/train_0.5M_CN 89 | local_path: '' 90 | dataset_format: alpaca 91 | multi_turn: False 92 | 93 | belle_1m: 94 | hf_hub_url: BelleGroup/train_1M_CN 95 | local_path: '' 96 | dataset_format: alpaca 97 | multi_turn: False 98 | 99 | belle_2m: 100 | hf_hub_url: BelleGroup/train_2M_CN 101 | local_path: '' 102 | dataset_format: alpaca 103 | multi_turn: False 104 | 105 | belle_dialog: 106 | hf_hub_url: BelleGroup/generated_chat_0.4M 107 | local_path: '' 108 | dataset_format: belle_dialog 109 | multi_turn: False 110 | 111 | belle_math: 112 | hf_hub_url: BelleGroup/school_math_0.25M 113 | local_path: '' 114 | dataset_format: alpaca 115 | multi_turn: False 116 | 117 | belle_multiturn: 118 | hf_hub_url: BelleGroup/multi_turn_0.5M 119 | local_path: '' 120 | dataset_format: belle_multiturn 121 | multi_turn: True 122 | columns: 123 | prompt: instruction 124 | query: '' 125 | response: output 126 | history: history 127 | 128 | # firefly 129 | firefly: 130 | hf_hub_url: YeungNLP/firefly-train-1.1M 131 | local_path: '' 132 | dataset_format: alpaca 133 | multi_turn: False 134 | columns: 135 | prompt: input 136 | query: '' 
137 | response: target 138 | history: '' 139 | 140 | # CodeAlpaca 141 | codealpaca: 142 | hf_hub_url: sahil2801/CodeAlpaca-20k 143 | local_path: '' 144 | dataset_format: codealpaca 145 | multi_turn: False 146 | 147 | # alpacacot 148 | alpaca_cot: 149 | hf_hub_url: QingyiSi/Alpaca-CoT 150 | local_path: '' 151 | multi_turn: False 152 | 153 | webqa: 154 | hf_hub_url: suolyer/webqa 155 | local_path: '' 156 | dataset_format: webqa 157 | multi_turn: False 158 | columns: 159 | prompt: input 160 | query: '' 161 | response: output 162 | history: '' 163 | 164 | # mutli-turn datasets 165 | evol_instruct: 166 | hf_hub_url: WizardLM/WizardLM_evol_instruct_V2_196k 167 | local_path: WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json 168 | dataset_format: sharegpt 169 | multi_turn: True 170 | 171 | share_gpt: 172 | hf_hub_url: '' 173 | local_path: /home/robin/prompt_data/sharegpt/sharegpt_split.json 174 | dataset_format: sharegpt 175 | multi_turn: True 176 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/train/training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | from typing import Any, Dict 6 | 7 | import numpy as np 8 | import transformers 9 | from torch.utils.data import Dataset 10 | 11 | 12 | def train_and_evaluate(trainer: transformers.Trainer, args: argparse.Namespace, 13 | logger: None) -> None: 14 | """ 15 | Trains and evaluates a machine learning model. 16 | 17 | Args: 18 | trainer (Trainer): The training object to use for training and evaluation. 19 | args (argparse.Namespace): The command line arguments for the current run. 20 | Returns: 21 | None 22 | """ 23 | # Create dictionary to store metrics 24 | all_metrics: Dict[str, Any] = {'run_name': args.run_name} 25 | 26 | # Training 27 | if args.do_train: 28 | logger.info('=' * 80) 29 | logger.info('*** Train ***') 30 | logger.info('=' * 80) 31 | train_result = trainer.train( 32 | resume_from_checkpoint=args.resume_checkpoint) 33 | metrics = train_result.metrics 34 | 35 | metrics['train_samples'] = len(trainer.train_dataset) 36 | 37 | # Log and save training metrics 38 | trainer.log_metrics('train', metrics) 39 | trainer.save_metrics('train', metrics) 40 | trainer.save_state() 41 | 42 | # Update metrics dictionary with training metrics 43 | all_metrics.update(metrics) 44 | 45 | # Evaluation 46 | if args.do_eval: 47 | logger.info('=' * 80) 48 | logger.info('*** Evaluate ***') 49 | logger.info('=' * 80) 50 | 51 | # Evaluate the trained model and obtain evaluation metrics 52 | metrics = trainer.evaluate(metric_key_prefix='eval') 53 | 54 | try: 55 | perplexity = math.exp(metrics['eval_loss']) 56 | except OverflowError: 57 | perplexity = float('inf') 58 | 59 | metrics['perplexity'] = perplexity 60 | metrics['eval_samples'] = len(trainer.eval_dataset) 61 | # Log and save evaluation metrics 62 | trainer.log_metrics('eval', metrics) 63 | trainer.save_metrics('eval', metrics) 64 | 65 | # Update metrics dictionary with evaluation metrics 66 | all_metrics.update(metrics) 67 | 68 | # Save all metrics to a json file 69 | if args.do_train or args.do_eval: 70 | with open(os.path.join(args.output_dir, 'metrics.json'), 'w') as fout: 71 | fout.write(json.dumps(all_metrics)) 72 | 73 | 74 | def predict_and_save(trainer: transformers.Trainer, 75 | tokenizer: transformers.PreTrainedTokenizer, 76 | predict_dataset: Dataset, args: argparse.Namespace, 77 | logger: None) -> 
None: 78 | """ 79 | Make predictions on new data, save them to a file along with input examples, 80 | and update the overall metrics. 81 | """ 82 | logger.info('=' * 80) 83 | logger.info('*** Predict ***') 84 | logger.info('=' * 80) 85 | data_dict = predict_dataset.dataset 86 | 87 | # Make predictions on the test dataset 88 | prediction_output = trainer.predict(test_dataset=predict_dataset, 89 | metric_key_prefix='predict') 90 | 91 | # Get the predictions and metrics 92 | prediction_metrics = prediction_output.metrics 93 | predictions = prediction_output.predictions 94 | 95 | # Replace -100 values with pad token ID and decode predictions 96 | predictions = np.where(predictions != -100, predictions, 97 | tokenizer.pad_token_id) 98 | predictions = tokenizer.batch_decode(predictions, 99 | skip_special_tokens=True, 100 | clean_up_tokenization_spaces=True) 101 | 102 | data_dict = predict_dataset.dataset 103 | # Create dictionary to store metrics 104 | all_metrics: Dict[str, Any] = {'run_name': args.run_name} 105 | # Write predictions and input examples to file 106 | with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout: 107 | for i, example in enumerate(data_dict): 108 | example['prediction_with_input'] = predictions[i].strip() 109 | example['prediction'] = predictions[i].replace( 110 | example['input'], '').strip() 111 | fout.write(json.dumps(example) + '\n') 112 | 113 | # Print and log the prediction metrics 114 | print(prediction_metrics) 115 | trainer.log_metrics('predict', prediction_metrics) 116 | trainer.save_metrics('predict', prediction_metrics) 117 | 118 | # Update the overall metrics 119 | all_metrics.update(prediction_metrics) 120 | 121 | # Save the overall metrics to a file 122 | with open(os.path.join(args.output_dir, 'eval_metrics.json'), 'w') as fout: 123 | fout.write(json.dumps(all_metrics)) 124 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/configs/data_args.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import List, Optional 4 | 5 | import yaml 6 | 7 | 8 | @dataclass 9 | class DatasetAttr(object): 10 | 11 | dataset_name: Optional[str] = None 12 | hf_hub_url: Optional[str] = None 13 | local_path: Optional[str] = None 14 | dataset_format: Optional[str] = None 15 | load_from_local: bool = False 16 | multi_turn: Optional[bool] = False 17 | 18 | def __repr__(self) -> str: 19 | rep = (f'dataset_name: {self.dataset_name} || ' 20 | f'hf_hub_url: {self.hf_hub_url} || ' 21 | f'local_path: {self.local_path} \n' 22 | f'data_formate: {self.dataset_format} || ' 23 | f'load_from_local: {self.load_from_local} || ' 24 | f'multi_turn: {self.multi_turn}') 25 | return rep 26 | 27 | def __post_init__(self): 28 | self.prompt_column = 'instruction' 29 | self.query_column = 'input' 30 | self.response_column = 'output' 31 | self.history_column = None 32 | 33 | 34 | @dataclass 35 | class DataArguments: 36 | dataset_cfg: Optional[str] = field( 37 | default='./data/alpaca_zh.yaml', 38 | metadata={ 39 | 'help': 40 | 'Path to dataset infos, please refer to `./data/README.md` to see how to prepare your datasets for training.' 41 | }) 42 | instruction_template: str = field( 43 | default='default', 44 | metadata={ 45 | 'help': 46 | 'Which template to use for constructing prompts in training and inference.' 
47 | }) 48 | conversation_template: str = field( 49 | default='default', 50 | metadata={ 51 | 'help': 52 | 'Which template to use for constructing prompts in multi-turn dataset training and inference.' 53 | }) 54 | # 验证数据集的尺寸,也就是数量 55 | eval_dataset_size: Optional[float] = field( 56 | default=0.1, metadata={'help': 'Size of validation dataset.'}) 57 | # 最大训练数据样本的数量。主要是为了快速调试训练代码 58 | max_train_samples: Optional[int] = field( 59 | default=None, 60 | metadata={ 61 | 'help': 62 | 'For debugging purposes or quicker training, truncate the number of training examples to this ' 63 | 'value if set.' 64 | }, 65 | ) 66 | # 与max_train_samples类似,主要是为了快速调试训练代码 67 | max_eval_samples: Optional[int] = field( 68 | default=None, 69 | metadata={ 70 | 'help': 71 | 'For debugging purposes or quicker training, truncate the number of evaluation examples to this ' 72 | 'value if set.' 73 | }, 74 | ) 75 | 76 | def init_for_training(self): # support mixing multiple datasets 77 | assert self.dataset_cfg is not None and os.path.exists( 78 | self.dataset_cfg 79 | ), f'{self.dataset_cfg} does not exist!, please check the path.' 80 | datasets_info = yaml.safe_load(open(self.dataset_cfg, 'r')) 81 | self.dataset_names = list(datasets_info.keys()) 82 | self.dataset_attr_list: List[DatasetAttr] = [] 83 | for i, name in enumerate(self.dataset_names): 84 | dataset_attr = DatasetAttr() 85 | dataset_attr.dataset_name = name 86 | dataset_attr.dataset_format = datasets_info[name].get( 87 | 'dataset_format', None) 88 | dataset_attr.hf_hub_url = datasets_info[name].get( 89 | 'hf_hub_url', None) 90 | dataset_attr.local_path = datasets_info[name].get( 91 | 'local_path', None) 92 | dataset_attr.multi_turn = datasets_info[name].get( 93 | 'multi_turn', False) 94 | 95 | if datasets_info[name]['local_path'] and os.path.exists( 96 | datasets_info[name]['local_path']): 97 | dataset_attr.load_from_local = True 98 | else: 99 | dataset_attr.load_from_local = False 100 | raise Warning( 101 | 'You have set local_path: {} for {} but it does not exist! Will load the data from {}' 102 | .format(name, dataset_attr.local_path, 103 | dataset_attr.hf_hub_url)) 104 | 105 | if 'columns' in datasets_info[name]: 106 | dataset_attr.prompt_column = datasets_info[name][ 107 | 'columns'].get('prompt', None) 108 | dataset_attr.query_column = datasets_info[name]['columns'].get( 109 | 'query', None) 110 | dataset_attr.response_column = datasets_info[name][ 111 | 'columns'].get('response', None) 112 | dataset_attr.history_column = datasets_info[name][ 113 | 'columns'].get('history', None) 114 | 115 | self.dataset_attr_list.append(dataset_attr) 116 | -------------------------------------------------------------------------------- /source/model/llama2/chatllms/utils/logger_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch.distributed as dist 4 | 5 | logger_initialized: dict = {} 6 | 7 | 8 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 9 | """Initialize and get a logger by name. 10 | 11 | If the logger has not been initialized, this method will initialize the 12 | logger by adding one or two handlers, otherwise the initialized logger will 13 | be directly returned. During initialization, a StreamHandler will always be 14 | added. If `log_file` is specified and the process rank is 0, a FileHandler 15 | will also be added. 16 | 17 | Args: 18 | name (str): Logger name. 19 | log_file (str | None): The log filename. 
If specified, a FileHandler 20 | will be added to the logger. 21 | log_level (int): The logger level. Note that only the process of 22 | rank 0 is affected, and other processes will set the level to 23 | "Error" thus be silent most of the time. 24 | file_mode (str): The file mode used in opening log file. 25 | Defaults to 'w'. 26 | 27 | Returns: 28 | logging.Logger: The expected logger. 29 | """ 30 | logger = logging.getLogger(name) 31 | if name in logger_initialized: 32 | return logger 33 | # handle hierarchical names 34 | # e.g., logger "a" is initialized, then logger "a.b" will skip the 35 | # initialization since it is a child of "a". 36 | for logger_name in logger_initialized: 37 | if name.startswith(logger_name): 38 | return logger 39 | 40 | # handle duplicate logs to the console 41 | # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) 42 | # to the root logger. As logger.propagate is True by default, this root 43 | # level handler causes logging messages from rank>0 processes to 44 | # unexpectedly show up on the console, creating much unwanted clutter. 45 | # To fix this issue, we set the root logger's StreamHandler, if any, to log 46 | # at the ERROR level. 47 | for handler in logger.root.handlers: 48 | if type(handler) is logging.StreamHandler: 49 | handler.setLevel(logging.ERROR) 50 | 51 | stream_handler = logging.StreamHandler() 52 | handlers = [stream_handler] 53 | 54 | if dist.is_available() and dist.is_initialized(): 55 | rank = dist.get_rank() 56 | else: 57 | rank = 0 58 | 59 | # only rank 0 will add a FileHandler 60 | if rank == 0 and log_file is not None: 61 | # Here, the default behaviour of the official logger is 'a'. Thus, we 62 | # provide an interface to change the file mode to the default 63 | # behaviour. 64 | file_handler = logging.FileHandler(log_file, file_mode) 65 | handlers.append(file_handler) 66 | 67 | formatter = logging.Formatter( 68 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 69 | for handler in handlers: 70 | handler.setFormatter(formatter) 71 | handler.setLevel(log_level) 72 | logger.addHandler(handler) 73 | 74 | if rank == 0: 75 | logger.setLevel(log_level) 76 | else: 77 | logger.setLevel(logging.ERROR) 78 | 79 | logger_initialized[name] = True 80 | 81 | return logger 82 | 83 | 84 | def print_log(msg, logger=None, level=logging.INFO): 85 | """Print a log message. 86 | 87 | Args: 88 | msg (str): The message to be logged. 89 | logger (logging.Logger | str | None): The logger to be used. 90 | Some special loggers are: 91 | 92 | - "silent": no message will be printed. 93 | - other str: the logger obtained with `get_root_logger(logger)`. 94 | - None: The `print()` method will be used to print log messages. 95 | level (int): Logging level. Only available when `logger` is a Logger 96 | object or "root". 97 | """ 98 | if logger is None: 99 | print(msg) 100 | elif isinstance(logger, logging.Logger): 101 | logger.log(level, msg) 102 | elif logger == 'silent': 103 | pass 104 | elif isinstance(logger, str): 105 | _logger = get_logger(logger) 106 | _logger.log(level, msg) 107 | else: 108 | raise TypeError( 109 | 'logger should be either a logging.Logger object, str, ' 110 | f'"silent" or None, but got {type(logger)}') 111 | 112 | 113 | def get_root_logger(log_file=None, log_level=logging.INFO): 114 | """Get root logger. 115 | 116 | Args: 117 | log_file (str, optional): File path of log. Defaults to None. 118 | log_level (int, optional): The level of logger. 119 | Defaults to logging.INFO. 
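
    Example (illustrative):
        >>> logger = get_root_logger(log_file='work_dir/train.log')
        >>> logger.info('Training started.')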
120 | 121 | Returns: 122 | :obj:`logging.Logger`: The obtained logger 123 | """ 124 | logger = get_logger(name='chatllms', 125 | log_file=log_file, 126 | log_level=log_level) 127 | 128 | return logger 129 | -------------------------------------------------------------------------------- /source/model/llama2/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import pathlib 4 | from typing import Tuple 5 | 6 | import torch 7 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 8 | HfArgumentParser, PreTrainedModel, 9 | PreTrainedTokenizer, Trainer) 10 | 11 | from chatllms.configs import DataArguments, ModelArguments, TrainingArguments 12 | from chatllms.data import make_supervised_data_module 13 | from chatllms.utils.model_utils import (add_special_tokens_if_missing, 14 | safe_save_model_for_hf_trainer) 15 | 16 | 17 | def load_model_tokenizer(args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]: 18 | """ 19 | Load a pre-trained model and tokenizer for natural language processing tasks. 20 | 21 | Args: 22 | args: An object containing the input arguments. 23 | 24 | Returns: 25 | A tuple containing the loaded model and tokenizer. 26 | """ 27 | # Determine the torch data type based on the input arguments 28 | torch_dtype = torch.float16 if args.fp16 else ( 29 | torch.bfloat16 if args.bf16 else torch.float32) 30 | 31 | config_kwargs = { 32 | 'cache_dir': args.cache_dir, 33 | 'use_auth_token': args.use_auth_token, 34 | 'trust_remote_code': args.trust_remote_code, 35 | } 36 | 37 | # Load the pre-trained model 38 | print(f'Loading Model from {args.model_name_or_path}...') 39 | model = AutoModelForCausalLM.from_pretrained( 40 | args.model_name_or_path, 41 | torch_dtype=torch_dtype, 42 | **config_kwargs, 43 | ) 44 | 45 | # Enable model parallelism 46 | setattr(model, 'model_parallel', True) 47 | setattr(model, 'is_parallelizable', True) 48 | 49 | if args.gradient_checkpointing: 50 | logging.warning('Using gradient checkpointing...') 51 | model.enable_input_require_grads() 52 | model.config.use_cache = False # Turn off when gradient checkpointing is enabled 53 | 54 | # Load the tokenizer 55 | print(f'Loading tokenizer from {args.model_name_or_path}...') 56 | tokenizer = AutoTokenizer.from_pretrained( 57 | args.model_name_or_path, 58 | padding_side='right', 59 | model_max_length=args.model_max_length, 60 | use_fast=False, 61 | tokenizer_type='llama' if 'llama' in args.model_name_or_path else None, 62 | **config_kwargs, 63 | ) 64 | 65 | return model, tokenizer 66 | 67 | 68 | def train() -> None: 69 | """ 70 | Trains a language model using Hugging Face's Transformers library. 71 | 72 | Args: 73 | model_args (ModelArguments): The arguments for the model configuration. 74 | data_args (DataArguments): The arguments for the data configuration. 75 | training_args (TrainingArguments): The arguments for the training configuration. 
76 | 77 | Returns: 78 | None 79 | 80 | """ 81 | parser = HfArgumentParser( 82 | (ModelArguments, DataArguments, TrainingArguments)) 83 | (model_args, data_args, 84 | training_args) = parser.parse_args_into_dataclasses() 85 | data_args.init_for_training() 86 | args = argparse.Namespace(**vars(model_args), **vars(data_args), 87 | **vars(training_args)) 88 | # load model and tokenizer 89 | logging.warning('Loading model and tokenizer...') 90 | model, tokenizer = load_model_tokenizer(args=args) 91 | logging.warning('Successfully loaded model and tokenizer.') 92 | 93 | if 'llama' in args.model_name_or_path or 'baichuan' in args.model_name_or_path: 94 | logging.warning( 95 | f'Adding special tokens for {args.model_name_or_path}.') 96 | add_special_tokens_if_missing(tokenizer, model) 97 | 98 | if 'baichuan' in args.model_name_or_path: 99 | # Tie the weights 100 | model.tie_weights() 101 | 102 | # Create a supervised dataset and Trainer, then train the model 103 | logging.warning('Creating a supervised dataset and DataCollator...') 104 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args) 105 | 106 | # Initialize the Trainer object and start training 107 | logging.warning('Initializing Trainer object.') 108 | trainer = Trainer( 109 | model=model, 110 | tokenizer=tokenizer, 111 | args=training_args, 112 | **data_module, 113 | ) 114 | 115 | logging.warning('Start Training...') 116 | if list(pathlib.Path(training_args.output_dir).glob('checkpoint-*')): 117 | trainer.train(resume_from_checkpoint=True) 118 | else: 119 | trainer.train() 120 | 121 | logging.warning(f'Saving Model to {training_args.output_dir}') 122 | trainer.save_state() 123 | # Save the trained model 124 | safe_save_model_for_hf_trainer(trainer=trainer, 125 | output_dir=training_args.output_dir) 126 | 127 | logging.warning('Done.') 128 | 129 | 130 | if __name__ == '__main__': 131 | train() 132 | -------------------------------------------------------------------------------- /retrieval_contriever/generate_passage_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
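#
# Example invocation (illustrative; the model and passage paths are placeholders):
#
#   python generate_passage_embeddings.py \
#       --model_name_or_path facebook/contriever \
#       --passages psgs_w100.tsv \
#       --output_dir wikipedia_embeddings \
#       --shard_id 0 --num_shards 4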
6 | 7 | import os 8 | 9 | import argparse 10 | import csv 11 | import logging 12 | import pickle 13 | 14 | import numpy as np 15 | import torch 16 | 17 | import transformers 18 | 19 | import src.slurm 20 | import src.contriever 21 | import src.utils 22 | import src.data 23 | import src.normalize_text 24 | 25 | 26 | def embed_passages(args, passages, model, tokenizer): 27 | total = 0 28 | allids, allembeddings = [], [] 29 | batch_ids, batch_text = [], [] 30 | with torch.no_grad(): 31 | for k, p in enumerate(passages): 32 | batch_ids.append(p["id"]) 33 | if args.no_title or not "title" in p: 34 | text = p["text"] 35 | else: 36 | text = p["title"] + " " + p["text"] 37 | if args.lowercase: 38 | text = text.lower() 39 | if args.normalize_text: 40 | text = src.normalize_text.normalize(text) 41 | batch_text.append(text) 42 | 43 | if len(batch_text) == args.per_gpu_batch_size or k == len(passages) - 1: 44 | 45 | encoded_batch = tokenizer.batch_encode_plus( 46 | batch_text, 47 | return_tensors="pt", 48 | max_length=args.passage_maxlength, 49 | padding=True, 50 | truncation=True, 51 | ) 52 | 53 | encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()} 54 | embeddings = model(**encoded_batch) 55 | 56 | embeddings = embeddings.cpu() 57 | total += len(batch_ids) 58 | allids.extend(batch_ids) 59 | allembeddings.append(embeddings) 60 | 61 | batch_text = [] 62 | batch_ids = [] 63 | if k % 100000 == 0 and k > 0: 64 | print(f"Encoded passages {total}") 65 | 66 | allembeddings = torch.cat(allembeddings, dim=0).numpy() 67 | return allids, allembeddings 68 | 69 | 70 | def main(args): 71 | model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path) 72 | print(f"Model loaded from {args.model_name_or_path}.", flush=True) 73 | model.eval() 74 | model = model.cuda() 75 | if not args.no_fp16: 76 | model = model.half() 77 | 78 | passages = src.data.load_passages(args.passages) 79 | 80 | shard_size = len(passages) // args.num_shards 81 | start_idx = args.shard_id * shard_size 82 | end_idx = start_idx + shard_size 83 | if args.shard_id == args.num_shards - 1: 84 | end_idx = len(passages) 85 | 86 | passages = passages[start_idx:end_idx] 87 | print(f"Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}.") 88 | 89 | allids, allembeddings = embed_passages(args, passages, model, tokenizer) 90 | 91 | save_file = os.path.join(args.output_dir, args.prefix + f"_{args.shard_id:02d}") 92 | os.makedirs(args.output_dir, exist_ok=True) 93 | print(f"Saving {len(allids)} passage embeddings to {save_file}.") 94 | with open(save_file, mode="wb") as f: 95 | pickle.dump((allids, allembeddings), f) 96 | 97 | print(f"Total passages processed {len(allids)}. 
Written to {save_file}.") 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser() 102 | 103 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)") 104 | parser.add_argument("--output_dir", type=str, default="wikipedia_embeddings", help="dir path to save embeddings") 105 | parser.add_argument("--prefix", type=str, default="passages", help="prefix path to save embeddings") 106 | parser.add_argument("--shard_id", type=int, default=0, help="Id of the current shard") 107 | parser.add_argument("--num_shards", type=int, default=1, help="Total number of shards") 108 | parser.add_argument( 109 | "--per_gpu_batch_size", type=int, default=512, help="Batch size for the passage encoder forward pass" 110 | ) 111 | parser.add_argument("--passage_maxlength", type=int, default=512, help="Maximum number of tokens in a passage") 112 | parser.add_argument( 113 | "--model_name_or_path", type=str, help="path to directory containing model weights and config file" 114 | ) 115 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32") 116 | parser.add_argument("--no_title", action="store_true", help="title not added to the passage body") 117 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding") 118 | parser.add_argument("--normalize_text", action="store_true", help="lowercase text before encoding") 119 | 120 | args = parser.parse_args() 121 | 122 | src.slurm.init_distributed_mode(args) 123 | 124 | main(args) 125 | -------------------------------------------------------------------------------- /source/model/llama2/cli_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from threading import Thread 4 | from typing import List, Tuple 5 | 6 | import torch 7 | import transformers 8 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, 9 | PreTrainedTokenizer, TextIteratorStreamer) 10 | 11 | from chatllms.configs import GenerationArguments, ModelInferenceArguments 12 | from chatllms.utils.model_utils import get_logits_processor 13 | from chatllms.utils.template import PromptTemplate 14 | 15 | 16 | def generate_response( 17 | query: str, 18 | history: List[Tuple[str, str]], 19 | prefix: str, 20 | prompt_template: PromptTemplate, 21 | tokenizer: PreTrainedTokenizer, 22 | model: PreTrainedModel, 23 | generation_args: dict, 24 | ) -> List[str]: 25 | """ 26 | Generates a response to the given query using GPT-3.5 model and prints it to the console. 27 | 28 | Args: 29 | query (str): The input query for which a response is to be generated. 30 | history (List[Tuple[str, str]]): A list of previous queries and their responses. 31 | prefix (str): The prefix string added to the beginning of each input sequence. 32 | prompt_template (PromptTemplate): The prompt template used to generate the input sequence to the model. 33 | tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens. 34 | model (PreTrainedModel): The GPT-3.5 model used to generate the response. 35 | generation_args (dict): A dictionary containing the arguments to be passed to the generate() method of the model. 36 | 37 | Returns: 38 | List[Tuple[str, str]]: A list of all the previous queries and their responses, including the current one. 
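
    Example (illustrative; assumes the objects are created as in main() below, and that
    'default' is a valid template name for PromptTemplate):
        history = []
        prompt_template = PromptTemplate('default')
        history = generate_response('Hello!', history, '', prompt_template,
                                    tokenizer, model, generation_args)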
39 | """ 40 | 41 | # Convert the query and history into input IDs 42 | input_text = prompt_template.get_prompt(query, history, prefix) 43 | inputs = tokenizer(input_text, return_tensors='pt') 44 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 45 | 46 | # Create a TextIteratorStreamer object to stream the response from the model 47 | streamer = TextIteratorStreamer(tokenizer, 48 | timeout=60.0, 49 | skip_prompt=True, 50 | skip_special_tokens=True) 51 | 52 | # Set the arguments for the model's generate() method 53 | gen_kwargs = dict( 54 | inputs, 55 | streamer=streamer, 56 | logits_processor=get_logits_processor(), 57 | **generation_args.to_dict(), 58 | ) 59 | 60 | # Start a separate thread to generate the response asynchronously 61 | thread = Thread(target=model.generate, kwargs=gen_kwargs) 62 | thread.start() 63 | 64 | # Print the model name and the response as it is generated 65 | print('Assistant: ', end='', flush=True) 66 | response = '' 67 | for new_text in streamer: 68 | print(new_text, end='', flush=True) 69 | response += new_text 70 | print() 71 | 72 | # Update the history with the current query and response and return it 73 | history.append((query, response)) 74 | return history 75 | 76 | 77 | def main(): 78 | os_name = platform.system() 79 | clear_command = 'cls' if os_name == 'Windows' else 'clear' 80 | 81 | # Parse command-line arguments 82 | parser = transformers.HfArgumentParser( 83 | (ModelInferenceArguments, GenerationArguments)) 84 | model_server_args, generation_args = parser.parse_args_into_dataclasses() 85 | 86 | # Load the model and tokenizer 87 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 88 | 89 | model = AutoModelForCausalLM.from_pretrained( 90 | model_server_args.model_name_or_path, 91 | trust_remote_code=True, 92 | low_cpu_mem_usage=True, 93 | torch_dtype=torch.float16, 94 | device_map='auto').to(device).eval() 95 | 96 | tokenizer = AutoTokenizer.from_pretrained( 97 | model_server_args.model_name_or_path, 98 | trust_remote_code=True, 99 | use_fast=False, 100 | ) 101 | 102 | prompt_template = PromptTemplate(model_server_args.prompt_template) 103 | prefix = model_server_args.source_prefix if model_server_args.source_prefix else '' 104 | history: List[str] = [] 105 | print('欢迎使用 CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序') 106 | while True: 107 | try: 108 | query = input('\nUser: ') 109 | except UnicodeDecodeError: 110 | print( 111 | 'Detected decoding error at the inputs, please set the terminal encoding to utf-8.' 112 | ) 113 | continue 114 | if query.strip() == 'stop': 115 | break 116 | 117 | if query.strip() == 'clear': 118 | # Clear the conversation history 119 | history = [] 120 | os.system(clear_command) 121 | print('欢迎使用 CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序') 122 | continue 123 | 124 | # Perform prediction and printing 125 | history = generate_response(query, history, prefix, prompt_template, 126 | tokenizer, model, generation_args) 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /retrieval_contriever/src/normalize_text.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from chemdataextractor.text.normalize 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | Tools for normalizing text. 5 | https://github.com/mcs07/ChemDataExtractor 6 | :copyright: Copyright 2016 by Matt Swain. 
7 | :license: MIT 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining 10 | a copy of this software and associated documentation files (the 11 | 'Software'), to deal in the Software without restriction, including 12 | without limitation the rights to use, copy, modify, merge, publish, 13 | distribute, sublicense, and/or sell copies of the Software, and to 14 | permit persons to whom the Software is furnished to do so, subject to 15 | the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be 18 | included in all copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | """ 28 | 29 | #: Control characters. 30 | CONTROLS = { 31 | '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011', 32 | '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b', 33 | } 34 | # There are further control characters, but they are instead replaced with a space by unicode normalization 35 | # '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f' 36 | 37 | 38 | #: Hyphen and dash characters. 39 | HYPHENS = { 40 | '-', # \u002d Hyphen-minus 41 | '‐', # \u2010 Hyphen 42 | '‑', # \u2011 Non-breaking hyphen 43 | '⁃', # \u2043 Hyphen bullet 44 | '‒', # \u2012 figure dash 45 | '–', # \u2013 en dash 46 | '—', # \u2014 em dash 47 | '―', # \u2015 horizontal bar 48 | } 49 | 50 | #: Minus characters. 51 | MINUSES = { 52 | '-', # \u002d Hyphen-minus 53 | '−', # \u2212 Minus 54 | '-', # \uff0d Full-width Hyphen-minus 55 | '⁻', # \u207b Superscript minus 56 | } 57 | 58 | #: Plus characters. 59 | PLUSES = { 60 | '+', # \u002b Plus 61 | '+', # \uff0b Full-width Plus 62 | '⁺', # \u207a Superscript plus 63 | } 64 | 65 | #: Slash characters. 66 | SLASHES = { 67 | '/', # \u002f Solidus 68 | '⁄', # \u2044 Fraction slash 69 | '∕', # \u2215 Division slash 70 | } 71 | 72 | #: Tilde characters. 73 | TILDES = { 74 | '~', # \u007e Tilde 75 | '˜', # \u02dc Small tilde 76 | '⁓', # \u2053 Swung dash 77 | '∼', # \u223c Tilde operator #in mbert vocab 78 | '∽', # \u223d Reversed tilde 79 | '∿', # \u223f Sine wave 80 | '〜', # \u301c Wave dash #in mbert vocab 81 | '~', # \uff5e Full-width tilde #in mbert vocab 82 | } 83 | 84 | #: Apostrophe characters. 85 | APOSTROPHES = { 86 | "'", # \u0027 87 | '’', # \u2019 88 | '՚', # \u055a 89 | 'Ꞌ', # \ua78b 90 | 'ꞌ', # \ua78c 91 | ''', # \uff07 92 | } 93 | 94 | #: Single quote characters. 95 | SINGLE_QUOTES = { 96 | "'", # \u0027 97 | '‘', # \u2018 98 | '’', # \u2019 99 | '‚', # \u201a 100 | '‛', # \u201b 101 | 102 | } 103 | 104 | #: Double quote characters. 105 | DOUBLE_QUOTES = { 106 | '"', # \u0022 107 | '“', # \u201c 108 | '”', # \u201d 109 | '„', # \u201e 110 | '‟', # \u201f 111 | } 112 | 113 | #: Accent characters. 114 | ACCENTS = { 115 | '`', # \u0060 116 | '´', # \u00b4 117 | } 118 | 119 | #: Prime characters. 
120 | PRIMES = { 121 | '′', # \u2032 122 | '″', # \u2033 123 | '‴', # \u2034 124 | '‵', # \u2035 125 | '‶', # \u2036 126 | '‷', # \u2037 127 | '⁗', # \u2057 128 | } 129 | 130 | #: Quote characters, including apostrophes, single quotes, double quotes, accents and primes. 131 | QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES 132 | 133 | def normalize(text): 134 | for control in CONTROLS: 135 | text = text.replace(control, '') 136 | text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ') 137 | 138 | for hyphen in HYPHENS | MINUSES: 139 | text = text.replace(hyphen, '-') 140 | text = text.replace('\u00ad', '') 141 | 142 | for double_quote in DOUBLE_QUOTES: 143 | text = text.replace(double_quote, '"') # \u0022 144 | for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS): 145 | text = text.replace(single_quote, "'") # \u0027 146 | text = text.replace('′', "'") # \u2032 prime 147 | text = text.replace('‵', "'") # \u2035 reversed prime 148 | text = text.replace('″', "''") # \u2033 double prime 149 | text = text.replace('‶', "''") # \u2036 reversed double prime 150 | text = text.replace('‴', "'''") # \u2034 triple prime 151 | text = text.replace('‷', "'''") # \u2037 reversed triple prime 152 | text = text.replace('⁗', "''''") # \u2057 quadruple prime 153 | 154 | text = text.replace('…', '...').replace(' . . . ', ' ... ') # \u2026 155 | 156 | for slash in SLASHES: 157 | text = text.replace(slash, '/') 158 | 159 | #for tilde in TILDES: 160 | # text = text.replace(tilde, '~') 161 | 162 | return text 163 | -------------------------------------------------------------------------------- /retrieval_contriever/src/moco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import logging 6 | import copy 7 | import transformers 8 | 9 | from src import contriever, dist_utils, utils 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class MoCo(nn.Module): 15 | def __init__(self, opt): 16 | super(MoCo, self).__init__() 17 | 18 | self.queue_size = opt.queue_size 19 | self.momentum = opt.momentum 20 | self.temperature = opt.temperature 21 | self.label_smoothing = opt.label_smoothing 22 | self.norm_doc = opt.norm_doc 23 | self.norm_query = opt.norm_query 24 | self.moco_train_mode_encoder_k = opt.moco_train_mode_encoder_k # apply the encoder on keys in train mode 25 | 26 | retriever, tokenizer = self._load_retriever( 27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init 28 | ) 29 | 30 | self.tokenizer = tokenizer 31 | self.encoder_q = retriever 32 | self.encoder_k = copy.deepcopy(retriever) 33 | 34 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): 35 | param_k.data.copy_(param_q.data) 36 | param_k.requires_grad = False 37 | 38 | # create the queue 39 | self.register_buffer("queue", torch.randn(opt.projection_size, self.queue_size)) 40 | self.queue = nn.functional.normalize(self.queue, dim=0) 41 | 42 | self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) 43 | 44 | def _load_retriever(self, model_id, pooling, random_init): 45 | cfg = utils.load_hf(transformers.AutoConfig, model_id) 46 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id) 47 | 48 | if "xlm" in model_id: 49 | model_class = contriever.XLMRetriever 50 | else: 51 | model_class = contriever.Contriever 52 | 53 | if random_init: 54 | retriever = model_class(cfg) 55 | else: 56 | retriever = utils.load_hf(model_class, model_id) 57 | 58 | if "bert-" in model_id: 59 | if tokenizer.bos_token_id is None: 60 | tokenizer.bos_token = "[CLS]" 61 | if tokenizer.eos_token_id is None: 62 | tokenizer.eos_token = "[SEP]" 63 | 64 | retriever.config.pooling = pooling 65 | 66 | return retriever, tokenizer 67 | 68 | def get_encoder(self, return_encoder_k=False): 69 | if return_encoder_k: 70 | return self.encoder_k 71 | else: 72 | return self.encoder_q 73 | 74 | def _momentum_update_key_encoder(self): 75 | """ 76 | Update of the key encoder 77 | """ 78 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): 79 | param_k.data = param_k.data * self.momentum + param_q.data * (1.0 - self.momentum) 80 | 81 | @torch.no_grad() 82 | def _dequeue_and_enqueue(self, keys): 83 | # gather keys before updating queue 84 | keys = dist_utils.gather_nograd(keys.contiguous()) 85 | 86 | batch_size = keys.shape[0] 87 | 88 | ptr = int(self.queue_ptr) 89 | assert self.queue_size % batch_size == 0, f"{batch_size}, {self.queue_size}" # for simplicity 90 | 91 | # replace the keys at ptr (dequeue and enqueue) 92 | self.queue[:, ptr : ptr + batch_size] = keys.T 93 | ptr = (ptr + batch_size) % self.queue_size # move pointer 94 | 95 | self.queue_ptr[0] = ptr 96 | 97 | def _compute_logits(self, q, k): 98 | l_pos = torch.einsum("nc,nc->n", [q, k]).unsqueeze(-1) 99 | l_neg = torch.einsum("nc,ck->nk", [q, self.queue.clone().detach()]) 100 | 101 | logits = torch.cat([l_pos, l_neg], dim=1) 102 | return logits 103 | 104 | def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats={}, **kwargs): 105 | bsz = q_tokens.size(0) 106 | 107 | q = self.encoder_q(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query) 108 | 109 | # 
compute key features 110 | with torch.no_grad(): # no gradient to keys 111 | self._momentum_update_key_encoder() # update the key encoder 112 | 113 | if not self.encoder_k.training and not self.moco_train_mode_encoder_k: 114 | self.encoder_k.eval() 115 | 116 | k = self.encoder_k(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc) 117 | 118 | logits = self._compute_logits(q, k) / self.temperature 119 | 120 | # labels: positive key indicators 121 | labels = torch.zeros(bsz, dtype=torch.long).cuda() 122 | 123 | loss = torch.nn.functional.cross_entropy(logits, labels, label_smoothing=self.label_smoothing) 124 | 125 | self._dequeue_and_enqueue(k) 126 | 127 | # log stats 128 | if len(stats_prefix) > 0: 129 | stats_prefix = stats_prefix + "/" 130 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz) 131 | 132 | predicted_idx = torch.argmax(logits, dim=-1) 133 | accuracy = 100 * (predicted_idx == labels).float().mean() 134 | stdq = torch.std(q, dim=0).mean().item() 135 | stdk = torch.std(k, dim=0).mean().item() 136 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz) 137 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz) 138 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz) 139 | 140 | return loss, iter_stats 141 | -------------------------------------------------------------------------------- /source/model/llama2/examples/format_data/convert_alpaca.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datasets import load_dataset 4 | 5 | 6 | def json_dump(obj, path): 7 | with open(path, 'w', encoding='utf-8') as f: 8 | json.dump(obj, f, indent=2, ensure_ascii=False) 9 | 10 | 11 | def json_load(in_file): 12 | with open(in_file, 'r') as f: 13 | json_data = json.load(f) 14 | return json_data 15 | 16 | 17 | def convert_100PoisonMpts(in_file, out_file): 18 | raw_data = load_dataset('json', data_files=in_file)['train'] 19 | new_content = [] 20 | for i, raw_text in enumerate(raw_data): 21 | prompt = raw_text['prompt'] 22 | response = raw_text['answer'] 23 | if len(prompt) <= 5 or len(response) <= 5: 24 | continue 25 | new_content.append({ 26 | 'instruction': prompt, 27 | 'input': '', 28 | 'output': response, 29 | }) 30 | 31 | print(f'#out: {len(new_content)}') 32 | json_dump(new_content, out_file) 33 | 34 | 35 | def convert_Cvalues(in_file, out_file): 36 | raw_data = load_dataset('json', data_files=in_file)['train'] 37 | new_content = [] 38 | for i, raw_text in enumerate(raw_data): 39 | prompt = raw_text['prompt'] 40 | response = raw_text['pos_resp'] 41 | if len(prompt) <= 5 or len(response) <= 5: 42 | continue 43 | new_content.append({ 44 | 'instruction': prompt, 45 | 'input': '', 46 | 'output': response, 47 | }) 48 | 49 | print(f'#out: {len(new_content)}') 50 | json_dump(new_content, out_file) 51 | 52 | 53 | def convert_huatuogpt(in_file, out_file): 54 | raw_data = load_dataset('json', data_files=in_file)['train'] 55 | new_content = [] 56 | for i, raw_text in enumerate(raw_data): 57 | data = raw_text['data'] 58 | prompt = data[0].replace('问:', '') 59 | response = data[1].replace('答:', '') 60 | if len(prompt) <= 5 or len(response) <= 5: 61 | continue 62 | new_content.append({ 63 | 'instruction': prompt, 64 | 'input': '', 65 | 'output': response, 66 | }) 67 | print(f'#out: {len(new_content)}') 68 | json_dump(new_content, out_file) 69 | 70 | 71 | def convert_safety_attack(in_file, out_file): 72 | field_list = [ 73 | 'Reverse_Exposure', 'Goal_Hijacking', 'Prompt_Leaking', 74 | 'Unsafe_Instruction_Topic', 
'Role_Play_Instruction', 75 | 'Inquiry_With_Unsafe_Opinion' 76 | ] 77 | new_content = [] 78 | for field in field_list: 79 | raw_data = load_dataset('json', field=field, 80 | data_files=in_file)['train'] 81 | for i, raw_text in enumerate(raw_data): 82 | prompt = raw_text['prompt'] 83 | response = raw_text['response'] 84 | if len(prompt) <= 5 or len(response) <= 5: 85 | continue 86 | new_content.append({ 87 | 'instruction': prompt, 88 | 'input': '', 89 | 'output': response, 90 | }) 91 | print(f'#out: {len(new_content)}') 92 | json_dump(new_content, out_file) 93 | 94 | 95 | def convert_safety_scenarios(in_file, out_file): 96 | 97 | field_list = [ 98 | 'Unfairness_And_Discrimination', 'Crimes_And_Illegal_Activities', 99 | 'Insult', 'Mental_Health', 'Physical_Harm', 'Privacy_And_Property', 100 | 'Ethics_And_Morality' 101 | ] 102 | new_content = [] 103 | for field in field_list: 104 | raw_data = load_dataset('json', data_files=in_file, 105 | field=field)['train'] 106 | for i, raw_text in enumerate(raw_data): 107 | prompt = raw_text['prompt'] 108 | response = raw_text['response'] 109 | if len(prompt) <= 5 or len(response) <= 5: 110 | continue 111 | new_content.append({ 112 | 'instruction': prompt, 113 | 'input': '', 114 | 'output': response, 115 | }) 116 | print(f'#out: {len(new_content)}') 117 | json_dump(new_content, out_file) 118 | 119 | 120 | if __name__ == '__main__': 121 | 122 | data_path = '/home/robin/prompt_data/100PoisonMpts/train.jsonl' 123 | out_path = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.jsonl' 124 | convert_100PoisonMpts(data_path, out_file=out_path) 125 | 126 | data_path = '/home/robin/prompt_data/CValues-Comparison/test.jsonl' 127 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json' 128 | convert_Cvalues(data_path, out_file=out_path) 129 | 130 | data_path = '/home/robin/prompt_data/CValues-Comparison/train.jsonl' 131 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json' 132 | convert_Cvalues(data_path, out_file=out_path) 133 | 134 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_sft_data_v1.jsonl' 135 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json' 136 | convert_huatuogpt(data_path, out_file=out_path) 137 | 138 | data_path = '/home/robin/prompt_data/Safety-Prompts/instruction_attack_scenarios.json' 139 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json' 140 | convert_safety_attack(data_path, out_file=out_path) 141 | 142 | data_path = '/home/robin/prompt_data/Safety-Prompts/typical_safety_scenarios.json' 143 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json' 144 | convert_safety_scenarios(data_path, out_file=out_path) 145 | -------------------------------------------------------------------------------- /source/model/llama2/data/dataset_info.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | 4 | def get_dataset_info(dataset_dir): 5 | """ 6 | Returns the dataset info based on a pre-defined map of dataset names to their corresponding Hugging Face Hub IDs 7 | or local file paths. 8 | 9 | Args: 10 | dataset_dir (str): The local directory where the dataset is stored; this is used for datasets that are stored locally. 11 | 12 | Returns: 13 | dict: A mapping from dataset name to its source (hub ID or local path) and format metadata.
14 | """ 15 | dataset_info = { 16 | 'alpaca': { 17 | 'hf_hub_url': 'tatsu-lab/alpaca', 18 | 'local_path': 'tatsu-lab/alpaca/alpaca.json', 19 | 'multi_turn': False 20 | }, 21 | 'alpaca-clean': { 22 | 'hf_hub_url': 'yahma/alpaca-cleaned', 23 | 'local_path': '', 24 | 'multi_turn': False 25 | }, 26 | 'chip2': { 27 | 'hf_hub_url': 'laion/OIG', 28 | 'local_path': '', 29 | 'multi_turn': False 30 | }, 31 | 'self-instruct': { 32 | 'hf_hub_url': 'yizhongw/self_instruct', 33 | 'local_path': '', 34 | 'multi_turn': False 35 | }, 36 | 'guanaco': { 37 | 'hf_hub_url': 'JosephusCheung/GuanacoDataset', 38 | 'local_path': '', 39 | 'multi_turn': False 40 | }, 41 | 'hh-rlhf': { 42 | 'hf_hub_url': 'Anthropic/hh-rlhf', 43 | 'local_path': '', 44 | 'multi_turn': False 45 | }, 46 | 'longformer': { 47 | 'hf_hub_url': 'akoksal/LongForm', 48 | 'local_path': '', 49 | 'multi_turn': False 50 | }, 51 | 'openassistant-guanaco': { 52 | 'hf_hub_url': 53 | 'timdettmers/openassistant-guanaco', 54 | 'local_path': 55 | join(dataset_dir, 56 | 'timdettmers/openassistant_best_replies_train.jsonl'), 57 | 'multi_turn': 58 | False 59 | }, 60 | 'evol_instruct': { 61 | 'hf_hub_url': 62 | 'WizardLM/WizardLM_evol_instruct_V2_196k', 63 | 'local_path': 64 | join(dataset_dir, 'WizardLM/WizardLM_evol_instruct_V2_143k.json'), 65 | 'multi_turn': 66 | False 67 | }, 68 | 'dolly-15k': { 69 | 'hf_hub_url': 'databricks/databricks-dolly-15k', 70 | 'local_path': join(dataset_dir, 'databricks/databricks-dolly-15k'), 71 | 'multi_turn': False 72 | }, 73 | 'olcc': { 74 | 'hf_hub_url': 'yizhongw/olcc', 75 | 'local_path': join(dataset_dir, 'olcc/olcc_alpaca.json'), 76 | 'multi_turn': False 77 | }, 78 | 'share_gpt': { 79 | 'hf_hub_url': '', 80 | 'local_path': join(dataset_dir, 'sharegpt/sharegpt_split.json'), 81 | 'multi_turn': True 82 | }, 83 | '100PoisonMpts': { 84 | 'hf_hub_url': '', 85 | 'local_path': join(dataset_dir, '100PoisonMpts/train.jsonl'), 86 | 'multi_turn': False 87 | }, 88 | 'belle_0.5m': { 89 | 'hf_hub_url': 'BelleGroup/train_0.5M_CN', 90 | 'local_path': '', 91 | 'multi_turn': False 92 | }, 93 | 'belle_1m': { 94 | 'hf_hub_url': 'BelleGroup/train_1M_CN', 95 | 'local_path': '', 96 | 'multi_turn': False 97 | }, 98 | 'belle_2m': { 99 | 'hf_hub_url': 'BelleGroup/train_2M_CN', 100 | 'local_path': '', 101 | 'multi_turn': False 102 | }, 103 | 'belle_dialog': { 104 | 'hf_hub_url': 'BelleGroup/generated_chat_0.4M', 105 | 'local_path': '', 106 | 'multi_turn': False 107 | }, 108 | 'belle_math': { 109 | 'hf_hub_url': 'BelleGroup/school_math_0.25M', 110 | 'local_path': '', 111 | 'multi_turn': False 112 | }, 113 | 'belle_multiturn': { 114 | 'hf_hub_url': 'BelleGroup/multi_turn_0.5M', 115 | 'local_path': '', 116 | 'multi_turn': True, 117 | 'columns': { 118 | 'prompt': 'instruction', 119 | 'query': '', 120 | 'response': 'output', 121 | 'history': 'history' 122 | } 123 | }, 124 | 'firefly': { 125 | 'hf_hub_url': 'YeungNLP/firefly-train-1.1M', 126 | 'local_path': '', 127 | 'multi_turn': False, 128 | 'columns': { 129 | 'prompt': 'input', 130 | 'query': '', 131 | 'response': 'target', 132 | 'history': '' 133 | } 134 | }, 135 | 'codealpaca': { 136 | 'hf_hub_url': 'sahil2801/CodeAlpaca-20k', 137 | 'local_path': '', 138 | 'multi_turn': False 139 | }, 140 | 'alpaca_cot': { 141 | 'hf_hub_url': 'QingyiSi/Alpaca-CoT', 142 | 'local_path': '', 143 | 'multi_turn': False 144 | }, 145 | 'webqa': { 146 | 'hf_hub_url': 'suolyer/webqa', 147 | 'local_path': '', 148 | 'multi_turn': False, 149 | 'columns': { 150 | 'prompt': 'input', 151 | 'query': '', 152 | 'response': 'output', 
153 | 'history': '' 154 | } 155 | }, 156 | 'novel_tokens512_50k': { 157 | 'hf_hub_url': 'zxbsmk/webnovel_cn', 158 | 'local_path': '', 159 | 'multi_turn': False 160 | } 161 | } 162 | 163 | return dataset_info 164 | -------------------------------------------------------------------------------- /retrieval_contriever/src/contriever.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | import os 4 | import torch 5 | import transformers 6 | from transformers import BertModel, XLMRobertaModel 7 | 8 | from retrieval_contriever.src import utils 9 | 10 | 11 | class Contriever(BertModel): 12 | def __init__(self, config, pooling="average", **kwargs): 13 | super().__init__(config, add_pooling_layer=False) 14 | if not hasattr(config, "pooling"): 15 | self.config.pooling = pooling 16 | 17 | def forward( 18 | self, 19 | input_ids=None, 20 | attention_mask=None, 21 | token_type_ids=None, 22 | position_ids=None, 23 | head_mask=None, 24 | inputs_embeds=None, 25 | encoder_hidden_states=None, 26 | encoder_attention_mask=None, 27 | output_attentions=None, 28 | output_hidden_states=None, 29 | normalize=False, 30 | ): 31 | 32 | model_output = super().forward( 33 | input_ids=input_ids, 34 | attention_mask=attention_mask, 35 | token_type_ids=token_type_ids, 36 | position_ids=position_ids, 37 | head_mask=head_mask, 38 | inputs_embeds=inputs_embeds, 39 | encoder_hidden_states=encoder_hidden_states, 40 | encoder_attention_mask=encoder_attention_mask, 41 | output_attentions=output_attentions, 42 | output_hidden_states=output_hidden_states, 43 | ) 44 | 45 | last_hidden = model_output["last_hidden_state"] 46 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0) 47 | 48 | if self.config.pooling == "average": 49 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 50 | elif self.config.pooling == "cls": 51 | emb = last_hidden[:, 0] 52 | 53 | if normalize: 54 | emb = torch.nn.functional.normalize(emb, dim=-1) 55 | return emb 56 | 57 | 58 | class XLMRetriever(XLMRobertaModel): 59 | def __init__(self, config, pooling="average", **kwargs): 60 | super().__init__(config, add_pooling_layer=True) 61 | if not hasattr(config, "pooling"): 62 | self.config.pooling = pooling 63 | 64 | def forward( 65 | self, 66 | input_ids=None, 67 | attention_mask=None, 68 | token_type_ids=None, 69 | position_ids=None, 70 | head_mask=None, 71 | inputs_embeds=None, 72 | encoder_hidden_states=None, 73 | encoder_attention_mask=None, 74 | output_attentions=None, 75 | output_hidden_states=None, 76 | normalize=False, 77 | ): 78 | 79 | model_output = super().forward( 80 | input_ids=input_ids, 81 | attention_mask=attention_mask, 82 | token_type_ids=token_type_ids, 83 | position_ids=position_ids, 84 | head_mask=head_mask, 85 | inputs_embeds=inputs_embeds, 86 | encoder_hidden_states=encoder_hidden_states, 87 | encoder_attention_mask=encoder_attention_mask, 88 | output_attentions=output_attentions, 89 | output_hidden_states=output_hidden_states, 90 | ) 91 | 92 | last_hidden = model_output["last_hidden_state"] 93 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0) 94 | if self.config.pooling == "average": 95 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 96 | elif self.config.pooling == "cls": 97 | emb = last_hidden[:, 0] 98 | if normalize: 99 | emb = torch.nn.functional.normalize(emb, dim=-1) 100 | return emb 101 | 102 | 103 | def 
load_retriever(model_path, pooling="average", random_init=False): 104 | # check whether a trained retriever checkpoint exists locally 105 | path = os.path.join(model_path, "checkpoint.pth") 106 | if os.path.exists(path): 107 | pretrained_dict = torch.load(path, map_location="cpu") 108 | opt = pretrained_dict["opt"] 109 | # the checkpoint options record which base model the retriever was trained from 110 | if hasattr(opt, "retriever_model_id"): 111 | # use the model id stored in the checkpoint 112 | retriever_model_id = opt.retriever_model_id 113 | else: 114 | # retriever_model_id = "bert-base-uncased" 115 | # fall back to the default multilingual BERT backbone 116 | retriever_model_id = "bert-base-multilingual-cased" 117 | tokenizer = utils.load_hf(transformers.AutoTokenizer, retriever_model_id) 118 | cfg = utils.load_hf(transformers.AutoConfig, retriever_model_id) 119 | if "xlm" in retriever_model_id: 120 | model_class = XLMRetriever 121 | else: 122 | model_class = Contriever 123 | retriever = model_class(cfg) 124 | pretrained_dict = pretrained_dict["model"] 125 | 126 | if any("encoder_q." in key for key in pretrained_dict.keys()): # test if model is defined with moco class 127 | pretrained_dict = {k.replace("encoder_q.", ""): v for k, v in pretrained_dict.items() if "encoder_q." in k} 128 | elif any("encoder." in key for key in pretrained_dict.keys()): # test if model is defined with inbatch class 129 | pretrained_dict = {k.replace("encoder.", ""): v for k, v in pretrained_dict.items() if "encoder." in k} 130 | retriever.load_state_dict(pretrained_dict, strict=False) 131 | else: 132 | retriever_model_id = model_path 133 | if "xlm" in retriever_model_id: 134 | model_class = XLMRetriever 135 | else: 136 | model_class = Contriever 137 | cfg = utils.load_hf(transformers.AutoConfig, model_path) 138 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_path) 139 | retriever = utils.load_hf(model_class, model_path) 140 | 141 | return retriever, tokenizer, retriever_model_id 142 | -------------------------------------------------------------------------------- /source/model/flan-t5/flan_seq2seq.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | import pickle 6 | import nltk 7 | 8 | nltk.download("punkt") 9 | 10 | from transformers import ( 11 | AutoTokenizer, 12 | AutoModelForSeq2SeqLM, 13 | DataCollatorForSeq2Seq, 14 | Seq2SeqTrainingArguments, 15 | Seq2SeqTrainer, 16 | ) 17 | from peft import ( 18 | get_peft_model, 19 | TaskType, 20 | LoraConfig, 21 | PrefixTuningConfig, 22 | ) 23 | 24 | from utils import get_data 25 | 26 | 27 | def main(args): 28 | model_name_or_path = args.pretrained_ckpt 29 | 30 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 31 | 32 | # loading dataset 33 | dataset, max_source_length, max_target_length = get_data(tokenizer) 34 | 35 | def preprocess_function(sample, padding="max_length"): 36 | # add prefix to the input for t5 37 | inputs = ["query: " + item for item in sample["question"]] 38 | 39 | # tokenize inputs 40 | model_inputs = tokenizer( 41 | inputs, max_length=max_source_length, padding=padding, truncation=True 42 | ) 43 | 44 | labels = tokenizer( 45 | text_target=sample["answer"], 46 | max_length=max_target_length, 47 | padding=padding, 48 | truncation=True, 49 | ) 50 | 51 | if padding == "max_length": 52 | labels["input_ids"] = [ 53 | [(l if l != tokenizer.pad_token_id else -100) for l in label] 54 | for label in labels["input_ids"] 55 | ] 56 | 57 | model_inputs["labels"] = labels["input_ids"] 58 | return model_inputs 59 | 60 | tokenized_dataset = dataset.map( 61
| preprocess_function, batched=True, remove_columns=["question", "answer"] 62 | ) 63 | print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}") 64 | 65 | print("Getting PEFT method") 66 | 67 | if args.peft_method == "lora": 68 | peft_config = LoraConfig( 69 | task_type=TaskType.SEQ_2_SEQ_LM, 70 | inference_mode=False, 71 | r=args.lora_r, 72 | lora_alpha=32, 73 | lora_dropout=args.dropout, 74 | target_modules=["q", "v"], 75 | ) 76 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_r-{args.lora_r}_dropout-{args.dropout}" 77 | 78 | elif args.peft_method == "prefix": 79 | peft_config = PrefixTuningConfig( 80 | task_type=TaskType.SEQ_2_SEQ_LM, 81 | inference_mode=False, 82 | num_virtual_tokens=args.prefix_tokens, 83 | prefix_projection=True if args.prefix_projection else False, 84 | ) 85 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_prefixTokens-{args.prefix_tokens}_useProjection-{args.prefix_projection}" 86 | else: results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}"  # plain full fine-tuning, no PEFT adapters 87 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) 88 | if args.peft_method != "sft": 89 | model = get_peft_model(model, peft_config) 90 | model.print_trainable_parameters() 91 | 92 | # Define training args 93 | training_args = Seq2SeqTrainingArguments( 94 | do_train=True, 95 | do_eval=True, 96 | evaluation_strategy="epoch", 97 | logging_strategy="epoch", 98 | save_strategy="no", 99 | per_device_eval_batch_size=8, 100 | per_device_train_batch_size=8, 101 | gradient_accumulation_steps=1, 102 | output_dir=results_dir, 103 | auto_find_batch_size=True, 104 | learning_rate=1e-3, 105 | num_train_epochs=args.epochs, 106 | logging_dir=f"{results_dir}/logs", 107 | report_to="none", 108 | ) 109 | 110 | # we want to ignore tokenizer pad token in the loss 111 | label_pad_token_id = -100 112 | # Data collator 113 | data_collator = DataCollatorForSeq2Seq( 114 | tokenizer, 115 | model=model, 116 | label_pad_token_id=label_pad_token_id, 117 | pad_to_multiple_of=8, 118 | ) 119 | 120 | print(f"training_args = {training_args}") 121 | # Create Trainer instance 122 | trainer = Seq2SeqTrainer( 123 | model=model, 124 | args=training_args, 125 | train_dataset=tokenized_dataset["train"], 126 | eval_dataset=tokenized_dataset["validation"], 127 | data_collator=data_collator, 128 | ) 129 | model.config.use_cache = False 130 | 131 | trainer_stats = trainer.train() 132 | train_loss = trainer_stats.training_loss 133 | eval_stats = trainer.evaluate() 134 | eval_loss = eval_stats["eval_loss"] 135 | print(f"Training loss:{train_loss}|Val loss:{eval_loss}") 136 | 137 | peft_model_id = f"{results_dir}/assets" 138 | trainer.model.save_pretrained(peft_model_id) 139 | tokenizer.save_pretrained(peft_model_id) 140 | 141 | with open(f"{results_dir}/results.pkl", "wb") as handle: 142 | run_result = [ 143 | args.epochs, 144 | args.prefix_tokens, 145 | args.prefix_projection, 146 | train_loss, 147 | eval_loss, 148 | ] 149 | pickle.dump(run_result, handle) 150 | print("Experiment over") 151 | 152 | 153 | if __name__ == "__main__": 154 | parser = argparse.ArgumentParser() 155 | parser.add_argument("--pretrained_ckpt", default="google/flan-t5-large") 156 | parser.add_argument("--peft_method", default="sft") 157 | parser.add_argument("--lora_r", default=16, type=int) 158 | parser.add_argument("--epochs", default=1, type=int) 159 | parser.add_argument("--prefix_tokens", default=20, type=int) 160 | parser.add_argument("--prefix_projection", default=1, type=int) 161 | parser.add_argument("--dropout", default=0.1,
type=float) 162 | parser.add_argument("--p_tokens", default=20, type=int) 163 | parser.add_argument("--p_hidden", default=100, type=int) 164 | parser.add_argument("--prompt_tokens", default=20, type=int) 165 | 166 | args = parser.parse_args() 167 | main(args) 168 | -------------------------------------------------------------------------------- /source/model/llama2/examples/finetune_llm/finetune_llama_with_qlora.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import transformers 5 | from datasets import load_dataset 6 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training 7 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 8 | BitsAndBytesConfig, DataCollatorForLanguageModeling, 9 | LlamaTokenizer, Trainer, TrainingArguments) 10 | 11 | DEFAULT_PAD_TOKEN = '[PAD]' 12 | DEFAULT_EOS_TOKEN = '</s>' 13 | DEFAULT_BOS_TOKEN = '<s>' 14 | DEFAULT_UNK_TOKEN = '<unk>' 15 | 16 | 17 | def print_trainable_parameters(model: AutoModelForCausalLM) -> None: 18 | """ 19 | Prints the number of trainable parameters in the model. 20 | """ 21 | trainable_params, all_param = 0, 0 22 | for _, param in model.named_parameters(): 23 | all_param += param.numel() 24 | if param.requires_grad: 25 | trainable_params += param.numel() 26 | print( 27 | f'trainable params: {trainable_params} || all params: {all_param} || trainable%: \ 28 | {100 * trainable_params / all_param}') 29 | 30 | 31 | def smart_tokenizer_and_embedding_resize( 32 | special_tokens_dict: Dict, 33 | tokenizer: transformers.PreTrainedTokenizer, 34 | model: transformers.PreTrainedModel, 35 | ): 36 | """Resize tokenizer and embedding. 37 | 38 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 39 | """ 40 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) 41 | model.resize_token_embeddings(len(tokenizer)) 42 | 43 | if num_new_tokens > 0: 44 | input_embeddings = model.get_input_embeddings().weight.data 45 | output_embeddings = model.get_output_embeddings().weight.data 46 | 47 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( 48 | dim=0, keepdim=True) 49 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( 50 | dim=0, keepdim=True) 51 | 52 | input_embeddings[-num_new_tokens:] = input_embeddings_avg 53 | output_embeddings[-num_new_tokens:] = output_embeddings_avg 54 | 55 | 56 | if __name__ == '__main__': 57 | model_id = 'decapoda-research/llama-7b-hf' 58 | bnb_config = BitsAndBytesConfig( 59 | load_in_4bit=True, 60 | bnb_4bit_use_double_quant=True, 61 | bnb_4bit_quant_type='nf4', 62 | bnb_4bit_compute_dtype=torch.bfloat16, 63 | ) 64 | """ 65 | - load_in_4bit: The model will be loaded in the memory with 4-bit precision. 66 | - bnb_4bit_use_double_quant: We will do the double quantization proposed by QLoRA. 67 | - bnb_4bit_quant_type: This is the type of quantization. “nf4” stands for 4-bit NormalFloat. 68 | - bnb_4bit_compute_dtype: While we load and store the model in 4-bit, 69 | we will partially dequantize it when needed and do all the computations with a 16-bit precision (bfloat16).
70 | """ 71 | # So now we can load the model in 4-bit: 72 | model = AutoModelForCausalLM.from_pretrained( 73 | model_id, quantization_config=bnb_config, device_map={'': 0}) 74 | 75 | # Then, we enable gradient checkpointing, to reduce the memory footprint of the model: 76 | model.gradient_checkpointing_enable() 77 | # Then, we load the tokenizer: 78 | if model.config.model_type == 'llama': 79 | # Due to the name of Transformers' LlamaTokenizer, we have to do this 80 | tokenizer = LlamaTokenizer.from_pretrained( 81 | model_id, 82 | padding_side='right', 83 | use_fast=True, 84 | ) 85 | else: 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | model_id, 88 | padding_side='right', 89 | use_fast=True, 90 | ) 91 | # Preprocessing the GPT model for LoRa 92 | model = prepare_model_for_kbit_training(model) 93 | # This is where we use PEFT. We prepare the model for LoRa, adding trainable adapters for each layer. 94 | config = LoraConfig( 95 | r=8, 96 | lora_alpha=32, 97 | target_modules=['q_proj', 'v_proj'], 98 | lora_dropout=0.05, 99 | bias='none', 100 | task_type='CAUSAL_LM', 101 | ) 102 | # We can now add the adapters to the model: 103 | model = get_peft_model(model, config) 104 | # We can now print the number of trainable parameters in the model: 105 | print_trainable_parameters(model) 106 | 107 | # Get your dataset ready 108 | # For this demo, I use the “english_quotes” dataset. This is a dataset made of famous quotes distributed under a CC BY 4.0 license. 109 | data = load_dataset('Abirate/english_quotes') 110 | data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) 111 | 112 | # Add special tokens to tokenizer if they are not already present 113 | special_tokens_dict: Dict[str, str] = {} 114 | if tokenizer.pad_token is None: 115 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN 116 | if tokenizer.eos_token is None: 117 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN 118 | if tokenizer.bos_token is None: 119 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN 120 | if tokenizer.unk_token is None: 121 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN 122 | 123 | smart_tokenizer_and_embedding_resize( 124 | special_tokens_dict=special_tokens_dict, 125 | tokenizer=tokenizer, 126 | model=model, 127 | ) 128 | 129 | trainer = Trainer( 130 | model=model, 131 | train_dataset=data['train'], 132 | args=TrainingArguments( 133 | per_device_train_batch_size=4, 134 | gradient_accumulation_steps=8, 135 | warmup_steps=2, 136 | max_steps=1000, 137 | learning_rate=2e-4, 138 | fp16=True, 139 | logging_steps=1, 140 | output_dir='outputs', 141 | optim='paged_adamw_8bit', 142 | ), 143 | data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), 144 | ) 145 | model.config.use_cache = False # silence the warnings. Please re-enable for inference! 146 | trainer.train() 147 | --------------------------------------------------------------------------------