├── source
│ ├── model
│ │ ├── llama2
│ │ │ ├── chatllms
│ │ │ │ ├── __init__.py
│ │ │ │ ├── server
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── train
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── training.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── stream_server.py
│ │ │ │ │ ├── apply_lora.py
│ │ │ │ │ └── logger_utils.py
│ │ │ │ ├── data
│ │ │ │ │ ├── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── convert_alpaca.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── data_loader.py
│ │ │ │ ├── evaluation
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── __version__.py
│ │ │ │ ├── model
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── sample_generate_callback.py
│ │ │ │ │ ├── compute_metrics.py
│ │ │ │ │ └── save_peft_model_callback.py
│ │ │ │ └── configs
│ │ │ │   ├── __init__.py
│ │ │ │   ├── lora_args.py
│ │ │ │   ├── quant_args.py
│ │ │ │   ├── model_args.py
│ │ │ │   ├── infer_args.py
│ │ │ │   ├── train_args.py
│ │ │ │   ├── gen_args.py
│ │ │ │   └── data_args.py
│ │ │ ├── assets
│ │ │ │ └── wechat.jpg
│ │ │ ├── scripts
│ │ │ │ ├── server
│ │ │ │ │ ├── gradio_base_webserver.sh
│ │ │ │ │ ├── run_inference.sh
│ │ │ │ │ ├── gradio_webserver.sh
│ │ │ │ │ ├── gradio_qlora_webserver.sh
│ │ │ │ │ └── apply_lora_to_base_model.sh
│ │ │ │ ├── eval.sh
│ │ │ │ ├── run.sh
│ │ │ │ ├── full_finetune
│ │ │ │ │ ├── full-finetune_ds.sh
│ │ │ │ │ └── full-finetune.sh
│ │ │ │ ├── lora_finetune
│ │ │ │ │ ├── lora-finetune.sh
│ │ │ │ │ └── lora-finetune_ds.sh
│ │ │ │ ├── test_qlora_finetune.sh
│ │ │ │ ├── qlora_finetune
│ │ │ │ │ ├── finetune_baichuan_7b_vicuna_zh.sh
│ │ │ │ │ ├── finetune_llama2_7b_alpaca_zh.sh
│ │ │ │ │ ├── finetune_llama_7b_alpaca_zh.sh
│ │ │ │ │ └── finetune_baichuan_7b_alpaca_zh.sh
│ │ │ │ ├── ds_config
│ │ │ │ │ ├── default_offload_opt_param.json
│ │ │ │ │ └── ds_config_zero3_auto.json
│ │ │ │ └── clean_data.sh
│ │ │ ├── data
│ │ │ │ ├── run_test.yaml
│ │ │ │ ├── belle_group.yaml
│ │ │ │ ├── vicuna_zh.yaml
│ │ │ │ ├── alpaca_zh.yaml
│ │ │ │ ├── alpaca_zh_pcyn.yaml
│ │ │ │ ├── vicuna_zh_pcyn.yaml
│ │ │ │ ├── dataset_info.yaml
│ │ │ │ └── dataset_info.py
│ │ │ ├── examples
│ │ │ │ ├── clean_sharegpt
│ │ │ │ │ ├── clean_evol_instruct.py
│ │ │ │ │ └── merge.py
│ │ │ │ ├── vllm
│ │ │ │ │ ├── vllm_demo.py
│ │ │ │ │ └── apil_chient.py
│ │ │ │ ├── finetune_llm
│ │ │ │ │ ├── baichuan7b_demo.py
│ │ │ │ │ └── finetune_llama_with_qlora.py
│ │ │ │ ├── format_data
│ │ │ │ │ ├── merge.py
│ │ │ │ │ ├── convert_oasst1.py
│ │ │ │ │ ├── convert_vicuna.py
│ │ │ │ │ └── convert_alpaca.py
│ │ │ │ └── test_convdataset.py
│ │ │ ├── requirements.txt
│ │ │ ├── chatbot.py
│ │ │ ├── server
│ │ │ │ ├── multi_chat.py
│ │ │ │ ├── single_chat.py
│ │ │ │ └── gradio_base_webserver.py
│ │ │ ├── train_qlora.py
│ │ │ ├── train.py
│ │ │ └── cli_demo.py
│ │ ├── flan-t5
│ │ │ ├── sample_ablate.sh
│ │ │ ├── run_ft.sh
│ │ │ ├── run_lora.sh
│ │ │ ├── run_prefix.sh
│ │ │ ├── utils.py
│ │ │ └── flan_seq2seq.py
│ │ ├── deepspeed.json
│ │ ├── flan_t5_predict.py
│ │ ├── gpt_predict.py
│ │ └── llama2_predict.py
│ └── arch
│   ├── self_knowledge
│   │ └── sk.py
│   ├── passage_relevance
│   │ └── pr.py
│   └── task_decomposition
│     └── td.py
├── ra-isf.png
├── evaluation.png
├── retrieval_contriever
│ ├── requirements.txt
│ ├── README.md
│ ├── example_scripts
│ │ ├── contriever.sh
│ │ └── mcontriever.sh
│ ├── evaluate_retrieved_passages.py
│ ├── preprocess.py
│ ├── src
│ │ ├── index.py
│ │ ├── inbatch.py
│ │ ├── dist_utils.py
│ │ ├── slurm.py
│ │ ├── normalize_text.py
│ │ ├── moco.py
│ │ └── contriever.py
│ └── generate_passage_embeddings.py
├── requirement.txt
├── run.sh
├── test.py
├── config.py
└── contriever_config.py
/source/model/llama2/chatllms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/train/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/data/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ra-isf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/ra-isf.png
--------------------------------------------------------------------------------
/evaluation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/evaluation.png
--------------------------------------------------------------------------------
/retrieval_contriever/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.11.0
2 | transformers==4.18.0
3 | beir==1.0.0
4 |
--------------------------------------------------------------------------------
/source/model/llama2/assets/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/source/model/llama2/assets/wechat.jpg
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_base_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_base_webserver.py \
2 | --model_name_or_path /home/robin/work_dir/llm/llm_pretrain_model/baichuan
3 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/__version__.py:
--------------------------------------------------------------------------------
1 | """Version information."""
2 |
3 | # The following line *must* be the last in the module, exactly as formatted:
4 | __version__ = '0.1.0'
5 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/run_inference.sh:
--------------------------------------------------------------------------------
1 | # generated_chat_vicuna
2 | CUDA_VISIBLE_DEVICES=0 python single_chat.py \
3 | --model_name_or_path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model
--------------------------------------------------------------------------------
/source/model/llama2/data/run_test.yaml:
--------------------------------------------------------------------------------
1 | 100PoisonMpts:
2 | hf_hub_url: 'damo/100PoisonMpts'
3 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json
4 | dataset_format: alpaca
5 | multi_turn: False
6 |
--------------------------------------------------------------------------------
/source/model/flan-t5/sample_ablate.sh:
--------------------------------------------------------------------------------
1 | sample_fraction=(0.025 0.05 0.1 0.25 0.5)
2 |
3 | for (( sf=0; sf<5; sf=sf+1 )) do
4 | python flan_classification.py --train_sample_fraction ${sample_fraction[$sf]} & wait
5 | done
6 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_webserver.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --lora_model_name_or_path work_dir/oasst1-llama-7b/checkpoint-414/adapter_model
4 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | python chatllms/evaluation/evaluate_zh.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --split test \
4 | --data_path ~/prompt_data/ceval-exam \
5 | --output_dir ./work_dir/ceval_output
6 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_qlora_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_qlora_webserver.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --lora_model_name_or_path ./work_dir/oasst1-llama-7b/checkpoint-831/adapter_model \
4 | --quant_type nf4 \
5 | --double_quant \
6 | --bits 4 \
7 | --fp16
8 |
--------------------------------------------------------------------------------
/source/model/flan-t5/run_ft.sh:
--------------------------------------------------------------------------------
1 | epochs=(2 5 10)
2 | lora_r=(2 4 8 16)
3 | dropout=(0.1 0.2)
4 |
5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do
6 | for ((r=0; r<4; r=r+1 )) do
7 | for (( d=0; d<2; d=d+1 )) do
8 | python flan_seq2seq.py --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/apply_lora_to_base_model.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python chatllms/utils/apply_lora.py \
2 | --base-model-path ~/checkpoints/baichuan7b/ \
3 | --lora-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/checkpoint-15000 \
4 | --target-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model
--------------------------------------------------------------------------------
/source/model/flan-t5/run_lora.sh:
--------------------------------------------------------------------------------
1 | epochs=(2 5 10)
2 | lora_r=(2 4 8 16)
3 | dropout=(0.1 0.2)
4 |
5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do
6 | for ((r=0; r<4; r=r+1 )) do
7 | for (( d=0; d<2; d=d+1 )) do
8 | python flan_seq2seq.py --peft_method "lora" --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .load_pretrain_model import load_model_tokenizer
2 | from .mmlueval_callback import MMLUEvalCallback
3 | from .sample_generate_callback import SampleGenerateCallback
4 | from .save_peft_model_callback import SavePeftModelCallback
5 |
6 | __all__ = [
7 | 'load_model_tokenizer', 'MMLUEvalCallback', 'SampleGenerateCallback',
8 | 'SavePeftModelCallback'
9 | ]
10 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.21.0
2 | deepspeed==0.10.1
3 | flash-attn==2.1.1
4 | jsonlines==3.1.0
5 | nltk==3.8.1
6 | numpy==1.24.4
7 | openai==0.27.8
8 | protobuf==4.24.0
9 | safetensors==0.3.2
10 | sentence-transformers==2.2.2
11 | sentencepiece==0.1.99
12 | spacy==2.2.4
13 | tiktoken==0.5.1
14 | tokenizers==0.15.0
15 | torch==2.0.1+cu118
16 | tqdm==4.66.1
17 | transformers==4.35.2
18 | uvicorn==0.23.2
19 | vllm==0.2.1.post1
20 |
--------------------------------------------------------------------------------
/source/model/flan-t5/run_prefix.sh:
--------------------------------------------------------------------------------
1 | epochs=(5 10 15 20)
2 | prefix_tokens=(10 25 50 100)
3 | prefix_projection=(0 1)
4 |
5 | for (( epoch=0; epoch<4; epoch=epoch+1 )) do
6 | for ((pt=0; pt<4; pt=pt+1 )) do
7 | for (( proj=0; proj<2; proj=proj+1 )) do
8 | python flan_seq2seq.py --prefix_tokens ${prefix_tokens[$pt]} --epochs ${epochs[$epoch]} --prefix_projection ${prefix_projection[$proj]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/clean_sharegpt/clean_evol_instruct.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from clean_sharegpt import get_clean_data, json_dump
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--in-file', type=str)
8 | parser.add_argument('--out-file', type=str)
9 | args = parser.parse_args()
10 |
11 | clean_data2 = get_clean_data(args)
12 | json_dump(clean_data2, args.out_file)
13 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # nohup sh scripts/finetune/finetune_baichuan_7b_olcc.sh > run2.log 2>&1 &
3 | # nohup sh scripts/multiturn/full-finetune_alpaca_ds.sh > run2.log 2>&1 &
4 | nohup sh scripts/qlora_finetune/multiturn_llama_finetune.sh > run_vicuna_llama_1gpu.log 2>&1 &
5 | nohup sh scripts/qlora_finetune/multiturn_baichuan_finetune.sh > run_vicuna_baichuan_1gpu.log 2>&1 &
6 | nohup sh scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh > run_zh_baichuan_1gpu.log 2>&1 &
7 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | python main.py --engine "llama2-13b" \
2 |     --base_model_path {base_model_path} \
3 |     --self_knowledge_model_path {self_knowledge_model_path} \
4 |     --passage_relevance_model_path {passage_relevance_model_path} \
5 |     --task_decomposition_model_path {task_decomposition_model_path} \
6 |     --data_path {data_path} \
7 |     --n_docs {number of documents to retrieve per question} \
8 |     --model_name_or_path {contriever_model_path} \
9 |     --passages_embedding "wikipedia_embeddings/*"
--------------------------------------------------------------------------------
/source/model/llama2/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | accelerate
3 | accelerate @ git+https://github.com/huggingface/accelerate.git
4 | bitsandbytes==0.39.0
5 | datasets
6 | deepspeed
7 | einops==0.6.1
8 | evaluate>=0.4.0
9 | gradio
10 | jieba
11 | nltk>=3.8.1
12 | numpy
13 | peft
14 | peft @ git+https://github.com/huggingface/peft.git
15 | rouge-chinese
16 | rouge-score>=0.1.2
17 | sentencepiece
18 | tokenizers
19 | torch
20 | transformers>=4.28.0
21 | transformers @ git+https://github.com/huggingface/transformers.git
22 | wandb==0.15.3
23 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_args import DataArguments
2 | from .gen_args import GenerationArguments
3 | from .infer_args import ModelInferenceArguments
4 | from .lora_args import LoraArguments
5 | from .model_args import ModelArguments
6 | from .quant_args import QuantArguments
7 | from .train_args import TrainingArguments
8 |
9 | __all__ = [
10 | 'DataArguments', 'GenerationArguments', 'ModelArguments',
11 | 'TrainingArguments', 'ModelInferenceArguments', 'LoraArguments',
12 | 'QuantArguments'
13 | ]
14 |
--------------------------------------------------------------------------------
/source/model/deepspeed.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_allow_untested_optimizer": true,
4 | "fp16": {
5 | "enabled": "auto",
6 | "loss_scale": 0,
7 | "initial_scale_power": 16,
8 | "loss_scale_window": 1000,
9 | "hysteresis": 2,
10 | "min_loss_scale": 1
11 | },
12 | "zero_optimization": {
13 | "stage": 2,
14 | "allgather_partitions": true,
15 | "allgather_bucket_size": 5e8,
16 | "overlap_comm": false,
17 | "reduce_scatter": true,
18 | "reduce_bucket_size": 5e8,
19 | "contiguous_gradients" : true
20 | }
21 | }
--------------------------------------------------------------------------------
/retrieval_contriever/README.md:
--------------------------------------------------------------------------------
1 | ## Retrieval using Contriever
2 |
3 | We use [Contriever](https://github.com/facebookresearch/contriever) as the retriever.
4 |
5 | ## References
6 |
7 | ```bibtex
8 | @misc{izacard2021contriever,
9 | title={Unsupervised Dense Information Retrieval with Contrastive Learning},
10 | author={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave},
11 | year={2021},
12 | url = {https://arxiv.org/abs/2112.09118},
13 | doi = {10.48550/ARXIV.2112.09118},
14 | }
15 | ```
16 |
17 |
--------------------------------------------------------------------------------
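A minimal sketch of how Contriever embeddings can be computed and scored by dot product. It mirrors /test.py, but uses plain `transformers` with explicit mean pooling instead of the repo's `retrieval_contriever.src.contriever.Contriever` wrapper; the public `facebook/contriever-msmarco` checkpoint below is an assumption standing in for a local checkpoint path.

```python
# Sketch only: assumes the public facebook/contriever-msmarco checkpoint;
# the repo's own wrapper in retrieval_contriever/src/contriever.py pools internally.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco')

query = 'Where was Marie Curie born?'
passages = [
    'Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.',
    'Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie.',
]


def mean_pooling(token_embeddings, mask):
    # Zero out padding positions, then average over the sequence length.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]


inputs = tokenizer([query] + passages, padding=True, truncation=True,
                   return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])

# Rank passages by dot-product similarity with the query embedding.
scores = embeddings[1:] @ embeddings[0]
print(scores)
```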
/source/model/llama2/chatllms/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .conv_dataset import make_conversation_data_module
2 | from .data_loader import make_supervised_data_module
3 | from .data_utils import (extract_alpaca_prompt_dataset,
4 | extract_default_prompt_dataset,
5 | extract_random_prompt_dataset)
6 | from .sft_dataset import make_instruction_data_module
7 |
8 | __all__ = [
9 | 'make_conversation_data_module', 'make_supervised_data_module',
10 | 'make_instruction_data_module', 'extract_random_prompt_dataset',
11 | 'extract_alpaca_prompt_dataset', 'extract_default_prompt_dataset'
12 | ]
13 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/vllm/vllm_demo.py:
--------------------------------------------------------------------------------
1 | from vllm import LLM, SamplingParams
2 |
3 | prompts = [
4 | 'Hello, my name is',
5 | 'The president of the United States is',
6 | 'The capital of France is',
7 | 'The future of AI is',
8 | ]
9 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
10 |
11 | llm = LLM(model='decapoda-research/llama-7b-hf', gpu_memory_utilization=0.9)
12 |
13 | # Print the outputs.
14 | for i in range(10):
15 | outputs = llm.generate(prompts, sampling_params)
16 | for output in outputs:
17 | prompt = output.prompt
18 | generated_text = output.outputs[0].text
19 | print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}')
20 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/lora_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 |
4 | @dataclass
5 | class LoraArguments:
6 |     # Rank of the LoRA update: number of columns of matrix A and rows of matrix B
7 | lora_r: int = field(default=64, metadata={'help': 'Lora R dimension.'})
8 |     # Scaling factor applied to the LoRA update
9 | lora_alpha: float = field(default=16, metadata={'help': ' Lora alpha.'})
10 |     # Dropout, a regularization method that mimics ensemble learning
11 | lora_dropout: float = field(default=0.0,
12 | metadata={'help': 'Lora dropout.'})
13 |     # Memory available per GPU, in MB. The default is the 80GB of a high-end A100.
14 | max_memory_MB: int = field(default=80000,
15 | metadata={'help': 'Free memory per gpu.'})
16 | lora_weight_path: str = ''
17 | bias: str = 'none'
18 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/full_finetune/full-finetune_ds.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \
4 | --output_dir work_dir/alpaca_full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 2e-5 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --lr_scheduler_type "cosine" \
17 | --logging_steps 1 \
18 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json"
19 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/lora_finetune/lora-finetune.sh:
--------------------------------------------------------------------------------
1 | python train_lora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name 100PoisonMpts \
4 | --output_dir work_dir/lora-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 1e-4 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --optim "adamw_torch" \
17 | --lr_scheduler_type "cosine" \
18 | --model_max_length 1024 \
19 | --logging_steps 1 \
20 | --do_train \
21 | --do_eval \
22 | --gradient_checkpointing True
23 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/full_finetune/full-finetune.sh:
--------------------------------------------------------------------------------
1 | python train.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name share_gpt \
4 | --output_dir work_dir/full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "steps" \
10 | --save_strategy "steps" \
11 | --eval_steps 1000 \
12 | --save_steps 1000 \
13 | --save_total_limit 5 \
14 | --logging_steps 1 \
15 | --learning_rate 2e-5 \
16 | --weight_decay 0. \
17 | --warmup_ratio 0.03 \
18 | --optim "adamw_torch" \
19 | --lr_scheduler_type "cosine" \
20 | --gradient_checkpointing True \
21 | --model_max_length 128 \
22 | --trust_remote_code \
23 | --do_train \
24 | --do_eval
25 |
--------------------------------------------------------------------------------
/source/model/flan_t5_predict.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import os
4 | import json
5 |
6 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
7 |
8 |
9 | def model_init(args):
10 | model_path = args.model_path
11 | device = torch.device("cuda:0")
12 | model = AutoModelForSeq2SeqLM.from_pretrained(
13 | model_path,
14 | torch_dtype=torch.float16,
15 | ).to(device)
16 | tokenizer = AutoTokenizer.from_pretrained(model_path)
17 | return model, tokenizer, device
18 |
19 |
20 | def predict(args, prompt, model, tokenizer):
21 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
22 | generate_ids = model.generate(**inputs, temperature=args.temperature)
23 |     # Seq2seq generation returns only decoder tokens, so decode the full output.
24 |     infer_res = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
25 | return infer_res
26 |
--------------------------------------------------------------------------------
/source/model/gpt_predict.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from openai import OpenAI
4 |
5 | def predict(args, prompt):
6 |     # Guard against the wrong engine before spending an API call.
7 |     if args.engine == 'llama2-13b':
8 |         raise NotImplementedError(
9 |             'Wrong engine for the GPT-3.5 predictor: {}'.format(args.engine))
10 |     my_key = args.api_key
11 |     max_length = 256
12 |     temperature = 0.0
13 |     top_p = 1
14 |     frequency_penalty = 0
15 |     presence_penalty = 0
16 |     client = OpenAI(api_key=my_key)
17 |     response = client.completions.create(
18 |         model="gpt-3.5-turbo-instruct",  # text-davinci-003 is deprecated
19 |         prompt=prompt,
20 |         max_tokens=max_length,
21 |         temperature=temperature,
22 |         top_p=top_p,
23 |         frequency_penalty=frequency_penalty,
24 |         presence_penalty=presence_penalty,
25 |     )
26 |     return response.choices[0].text
--------------------------------------------------------------------------------
/source/model/llama2_predict.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import os
4 | import json
5 | from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig
6 |
7 | def model_init(model_path):
8 | # model_path = args.model_path
9 | device = torch.device("cuda:0")
10 | model = LlamaForCausalLM.from_pretrained(
11 | model_path,
12 | torch_dtype=torch.float16,
13 | ).to(device)
14 | tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=False)
15 | return model, tokenizer
16 |
17 | def predict(args, prompt, model, tokenizer):
18 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
19 | generate_ids = model.generate(**inputs, max_length=args.max_length, temperature=args.temperature)
20 | generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
21 | infer_res = tokenizer.decode(generate_ids)
22 | return infer_res
23 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/clean_sharegpt/merge.py:
--------------------------------------------------------------------------------
1 | """
2 | Merge two conversation files into one
3 |
4 | Usage: python3 merge.py --in-file file1.json file2.json --out-file merged.json
5 | """
6 |
7 | import argparse
8 |
9 | from clean_sharegpt import json_dump, json_load
10 |
11 | if __name__ == '__main__':
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--in-file', type=str, required=True, nargs='+')
14 | parser.add_argument('--out-file', type=str, default='merged.json')
15 | args = parser.parse_args()
16 |
17 | new_content = []
18 | for in_file in args.in_file:
19 | content = json_load(in_file)
20 | print(f'in-file: {in_file}, len: {len(content)}')
21 | new_content.extend(content)
22 |
23 | print(f'#out: {len(new_content)}')
24 | print(f'Save new_content to {args.out_file}')
25 | json_dump(new_content, args.out_file)
26 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/lora_finetune/lora-finetune_ds.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train_lora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \
4 | --output_dir work_dir/alpaca_full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 2e-5 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --optim "adamw_torch" \
17 | --lr_scheduler_type "cosine" \
18 | --model_max_length 2048 \
19 | --logging_steps 1 \
20 | --do_train \
21 | --do_eval \
22 | --gradient_checkpointing True \
23 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json"
24 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/finetune_llm/baichuan7b_demo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 |
4 |
5 | def main(load_in_8bit=True, model_path=''):
6 | tokenizer = AutoTokenizer.from_pretrained(
7 | pretrained_model_name_or_path=model_path, trust_remote_code=True)
8 | model = AutoModelForCausalLM.from_pretrained(
9 | pretrained_model_name_or_path=model_path,
10 | load_in_8bit=load_in_8bit,
11 | torch_dtype=torch.float16,
12 | device_map='auto',
13 | trust_remote_code=True)
14 | inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt')
15 | inputs = inputs.to('cuda:0')
16 | pred = model.generate(**inputs, max_new_tokens=64)
17 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
18 |
19 |
20 | if __name__ == '__main__':
21 | load_in_8bit = True
22 | model_path = '/home/robin/work_dir/llm/llm_pretrain_model/baichuan'
23 | main(load_in_8bit, model_path)
24 |
--------------------------------------------------------------------------------
/source/model/llama2/chatbot.py:
--------------------------------------------------------------------------------
1 | import openai
2 | import gradio as gr
3 |
4 |
5 | if __name__ == "__main__":
6 | openai.api_key = "Your API key"
7 |
8 | messages = [
9 | {"role": "system", "content": "You are a helpful and kind AI Assistant."},
10 | ]
11 |
12 | def chatbot(input):
13 | if input:
14 | messages.append({"role": "user", "content": input})
15 | chat = openai.ChatCompletion.create(
16 | model="gpt-3.5-turbo", messages=messages
17 | )
18 | reply = chat.choices[0].message.content
19 | messages.append({"role": "assistant", "content": reply})
20 | return reply
21 |
22 | inputs = gr.inputs.Textbox(lines=7, label="Chat with AI")
23 | outputs = gr.outputs.Textbox(label="Reply")
24 |
25 | gr.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="AI Chatbot",
26 | description="Ask anything you want",
27 | theme="compact").launch(share=True)
--------------------------------------------------------------------------------
/source/model/llama2/data/belle_group.yaml:
--------------------------------------------------------------------------------
1 | belle_0.5m:
2 | hf_hub_url: BelleGroup/train_0.5M_CN
3 | local_path: ''
4 | dataset_format: alpaca
5 | multi_turn: False
6 |
7 | belle_1m:
8 | hf_hub_url: BelleGroup/train_1M_CN
9 | local_path: ''
10 | dataset_format: alpaca
11 | multi_turn: False
12 |
13 | belle_2m:
14 | hf_hub_url: BelleGroup/train_2M_CN
15 | local_path: ''
16 | dataset_format: alpaca
17 | multi_turn: False
18 |
19 | belle_dialog:
20 | hf_hub_url: BelleGroup/generated_chat_0.4M
21 | local_path: ''
22 | dataset_format: belle_dialog
23 | multi_turn: False
24 |
25 | belle_math:
26 | hf_hub_url: BelleGroup/school_math_0.25M
27 | local_path: ''
28 | dataset_format: alpaca
29 | multi_turn: False
30 |
31 | belle_multiturn:
32 | hf_hub_url: BelleGroup/multi_turn_0.5M
33 | local_path: ''
34 | dataset_format: belle_multiturn
35 | multi_turn: True
36 | columns:
37 | prompt: instruction
38 | query: ''
39 | response: output
40 | history: history
41 |
--------------------------------------------------------------------------------
/source/arch/self_knowledge/sk.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Self_Knowledge_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 |
24 |     def find_known(self, context, query, temperature=1.0):
25 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "know":
30 |             return True
31 |         elif result == "unknow":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on SKM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
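A hedged usage sketch for the sub-model classes under source/arch: they expect an already-loaded model and tokenizer, for example as returned by `model_init` in source/model/llama2_predict.py. The import paths and checkpoint path below are illustrative assumptions; the actual wiring lives in main.py, which is not shown in this dump.

```python
# Illustrative only: module paths and the checkpoint location are assumptions.
from source.model.llama2_predict import model_init
from source.arch.self_knowledge.sk import Self_Knowledge_Model

# Placeholder path to a fine-tuned self-knowledge checkpoint.
model, tokenizer = model_init('/path/to/self_knowledge_model')
skm = Self_Knowledge_Model(model, tokenizer)

context = 'Answer whether you know the answer to the question.\n'
query = 'Where was Marie Curie born?'
print(skm.find_known(context, query))  # True if the model answers "know"
```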
/source/arch/passage_relevance/pr.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Passage_Relevance_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 |
24 |     def find_relevance(self, context, query, passage, temperature=1.0):
25 |         inputs = self.tokenizer(context + query + "\nPassage: " + passage, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "relevance":
30 |             return True
31 |         elif result == "irrelevance":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on PRM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
/source/arch/task_decomposition/td.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Task_Decomposition_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 | self.query_list = list()
24 |
25 |     def decompose(self, context, query, temperature=1.0):
26 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
27 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
28 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
29 |         result = self.tokenizer.decode(generate_ids)
30 |         try:
31 |             data = json.loads(result)
32 |             for q in data['query']:
33 |                 self.query_list.append(q)
34 |         except json.JSONDecodeError:
35 |             print(f"Invalid format on TDM query: {context + query}, json_string: {result}")
36 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/test_qlora_finetune.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name olcc \
4 | --output_dir ./work_dir/run_test \
5 | --num_train_epochs 3 \
6 | --max_train_samples 100 \
7 | --max_eval_samples 100 \
8 | --per_device_train_batch_size 4 \
9 | --per_device_eval_batch_size 4 \
10 | --gradient_accumulation_steps 8 \
11 | --evaluation_strategy steps \
12 | --eval_steps 50 \
13 | --save_strategy steps \
14 | --save_total_limit 5 \
15 | --save_steps 100 \
16 | --logging_strategy steps \
17 | --logging_steps 1 \
18 | --learning_rate 0.0002 \
19 | --warmup_ratio 0.03 \
20 | --weight_decay 0.0 \
21 | --lr_scheduler_type constant \
22 | --adam_beta2 0.999 \
23 | --max_grad_norm 0.3 \
24 | --max_new_tokens 32 \
25 | --lora_r 64 \
26 | --lora_alpha 16 \
27 | --lora_dropout 0.1 \
28 | --double_quant \
29 | --quant_type nf4 \
30 | --fp16 \
31 | --bits 4 \
32 | --gradient_checkpointing \
33 | --trust_remote_code \
34 | --do_train \
35 | --do_eval \
36 | --sample_generate \
37 | --data_seed 42 \
38 | --seed 0
39 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=1 python train_qlora.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --dataset_cfg ./data/vicuna_zh_pcyn.yaml \
4 | --output_dir ./work_dir/vicuna_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 2 \
7 | --per_device_eval_batch_size 2 \
8 | --gradient_accumulation_steps 16 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --data_seed 42 \
36 | --seed 0
37 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | python train_qlora.py \
2 | --model_name_or_path meta-llama/Llama-2-7b-hf \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh_llama2-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | python train_qlora.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/quant_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 |
4 | @dataclass
5 | class QuantArguments:
6 |     # Use 8-bit Adam; this could be swapped for LION or Sophia, and DeepSpeed also offers several 1-bit optimizers.
7 | adam8bit: bool = field(default=False, metadata={'help': 'Use 8-bit adam.'})
8 |     # Whether to use double (nested) quantization
9 | double_quant: bool = field(
10 | default=True,
11 | metadata={
12 | 'help':
13 | 'Compress the quantization statistics through double quantization.'
14 | })
15 |     # Quantization data type: either `fp4` or `nf4`
16 | quant_type: str = field(
17 | default='nf4',
18 | metadata={
19 | 'help':
20 | 'Quantization data type to use. Should be one of `fp4` or `nf4`.'
21 | })
22 |     # Bit width to use; defaults to 4.
23 | bits: int = field(default=4, metadata={'help': 'How many bits to use.'})
24 |
25 | def __post_init__(self):
26 | if self.bits is not None:
27 | assert self.bits in [
28 | 4, 8
29 | ], 'We only accept 4-bit or 8-bit quantization.'
30 |
31 | if self.quant_type is not None:
32 | assert self.quant_type in [
33 | 'nf4', 'fp4'
34 | ], 'We only accept `nf4` or `fp4` quantization type.'
35 |
--------------------------------------------------------------------------------
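A minimal sketch of how these config dataclasses are typically parsed together, assuming `transformers.HfArgumentParser`; the exact wiring inside train_qlora.py is not shown in this dump, so treat the snippet as an assumption rather than the repo's code.

```python
# Sketch only: parse the chatllms config dataclasses from CLI flags such as
# those passed in scripts/qlora_finetune/*.sh (run with the same flags).
from transformers import HfArgumentParser

from chatllms.configs import (LoraArguments, ModelArguments, QuantArguments,
                              TrainingArguments)

parser = HfArgumentParser(
    (ModelArguments, TrainingArguments, LoraArguments, QuantArguments))
model_args, training_args, lora_args, quant_args = \
    parser.parse_args_into_dataclasses()

print(model_args.model_name_or_path, quant_args.bits, lora_args.lora_r)
```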
/source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer
3 | from retrieval_contriever.src.contriever import Contriever
4 |
5 | tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/contriever-msmarco')
6 | model = Contriever.from_pretrained('/root/autodl-tmp/contriever-msmarco')
7 |
8 | sentences = [
9 | "Where was Marie Curie born?",
10 | "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
11 | "Born in Paris on 15 May 1859, 111111 Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
12 | ]
13 |
14 | # Apply tokenizer
15 | inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
16 |
17 | # Compute token embeddings
18 | outputs = model(**inputs)
19 |
20 | # Mean pooling
21 | def mean_pooling(token_embeddings, mask):
22 | token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
23 | sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
24 | return sentence_embeddings
25 | # embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
26 | embeddings = outputs
27 | # print(embeddings[0])
28 | # print(embeddings[1])
29 | score1 = embeddings[0] @ embeddings[1]
30 | score2 = embeddings[0] @ embeddings[2]
31 | print(score1)
32 | print(score2)
--------------------------------------------------------------------------------
/source/model/llama2/examples/format_data/merge.py:
--------------------------------------------------------------------------------
1 | """
2 | Merge two conversation files into one
3 |
4 | Usage: python3 merge.py --in-file file1.json file2.json --out-file merged.json
5 | """
6 |
7 | import argparse
8 | import json
9 |
10 | from datasets import load_dataset
11 |
12 |
13 | def json_load(in_file):
14 | with open(in_file, 'r') as f:
15 | json_data = json.load(f)
16 | return json_data
17 |
18 |
19 | def json_dump(obj, path):
20 | with open(path, 'w', encoding='utf-8') as f:
21 | json.dump(obj, f, indent=2, ensure_ascii=False)
22 |
23 |
24 | def merge_datasets(in_file_list, out_file):
25 |
26 | new_content = []
27 | for in_file in in_file_list:
28 | content = load_dataset('json', data_files=in_file)['train']
29 |
30 | print(f'in-file: {in_file}, len: {len(content)}')
31 | new_content.extend(content)
32 |
33 | print(f'#out: {len(new_content)}')
34 | print(f'Save new_content to {out_file}')
35 | json_dump(new_content, out_file)
36 |
37 |
38 | if __name__ == '__main__':
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument('--in-file', type=str, required=True, nargs='+')
41 | parser.add_argument('--out-file', type=str, default='merged.json')
42 | args = parser.parse_args()
43 |
44 | merge_datasets(args.in_file, args.out_file)
45 |
--------------------------------------------------------------------------------
/source/model/llama2/data/vicuna_zh.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /home/robin/prompt_data/COIG/train_vicuna.json
5 | dataset_format: sharegpt
6 | multi_turn: True
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_vicuna.json
11 | dataset_format: sharegpt
12 | multi_turn: True
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_vicuna.json
17 | dataset_format: sharegpt
18 | multi_turn: True
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /home/robin/prompt_data/olcc/olcc_vicuna.json
23 | dataset_format: sharegpt
24 | multi_turn: True
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_vicuna.json
29 | dataset_format: sharegpt
30 | multi_turn: True
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json
35 | dataset_format: sharegpt
36 | multi_turn: True
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json
41 | dataset_format: sharegpt
42 | multi_turn: True
43 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/model_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 |
5 | @dataclass
6 | class ModelArguments:
7 | model_name_or_path: Optional[str] = field(
8 | default='facebook/opt-125m',
9 | metadata={
10 | 'help':
11 | ("The model checkpoint for weights initialization. Don't set if you want to\
12 | train a model from scratch.")
13 | },
14 | )
15 | tokenizer_name: Optional[str] = field(
16 | default=None,
17 | metadata={
18 | 'help':
19 | 'Pretrained tokenizer name or path if not the same as model_name'
20 | })
21 | model_revision: str = field(
22 | default='main',
23 | metadata={
24 | 'help':
25 | 'The specific model version to use (can be a branch name, tag name or commit id).'
26 | },
27 | )
28 | trust_remote_code: Optional[bool] = field(
29 | default=False,
30 | metadata={
31 | 'help':
32 | 'Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained.'
33 | })
34 | use_auth_token: Optional[bool] = field(
35 | default=False,
36 | metadata={
37 | 'help':
38 | 'Enables using Huggingface auth token from Git Credentials.'
39 | })
40 |
--------------------------------------------------------------------------------
/source/model/llama2/data/alpaca_zh.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /home/robin/prompt_data/COIG/train_alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json
23 | dataset_format: alpaca
24 | multi_turn: False
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: 'damo/100PoisonMpts'
28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
41 | dataset_format: alpaca
42 | multi_turn: False
43 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/ds_config/default_offload_opt_param.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupDecayLR",
16 | "params": {
17 | "total_num_steps": "auto",
18 | "warmup_min_lr": "auto",
19 | "warmup_max_lr": "auto",
20 | "warmup_num_steps": "auto"
21 | }
22 | },
23 | "zero_optimization": {
24 | "stage": 3,
25 | "offload_optimizer": {
26 | "device": "cpu",
27 | "pin_memory": true
28 | },
29 | "offload_param": {
30 | "device": "cpu",
31 | "pin_memory": true
32 | },
33 | "overlap_comm": true,
34 | "contiguous_gradients": true,
35 | "sub_group_size": 1e9,
36 | "reduce_bucket_size": "auto",
37 | "stage3_prefetch_bucket_size": "auto",
38 | "stage3_param_persistence_threshold": "auto",
39 | "stage3_max_live_parameters": 1e9,
40 | "stage3_max_reuse_distance": 1e9,
41 | "stage3_gather_16bit_weights_on_model_save": false
42 | },
43 | "gradient_accumulation_steps": "auto",
44 | "gradient_clipping": "auto",
45 | "steps_per_print": 5,
46 | "train_batch_size": "auto",
47 | "train_micro_batch_size_per_gpu": "auto",
48 | "wall_clock_breakdown": false
49 | }
50 |
--------------------------------------------------------------------------------
/source/model/llama2/data/alpaca_zh_pcyn.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_alpaca.json
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_alpaca.json
23 | dataset_format: alpaca
24 | multi_turn: False
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
41 | dataset_format: alpaca
42 | multi_turn: False
43 |
--------------------------------------------------------------------------------
/source/model/llama2/data/vicuna_zh_pcyn.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_vicuna.json
5 | dataset_format: sharegpt
6 | multi_turn: True
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_vicuna.json
11 | dataset_format: sharegpt
12 | multi_turn: True
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_vicuna.json
17 | dataset_format: sharegpt
18 | multi_turn: True
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_vicuna.json
23 | dataset_format: sharegpt
24 | multi_turn: True
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_vicuna.json
29 | dataset_format: sharegpt
30 | multi_turn: True
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json
35 | dataset_format: sharegpt
36 | multi_turn: True
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json
41 | dataset_format: sharegpt
42 | multi_turn: True
43 |
--------------------------------------------------------------------------------
/retrieval_contriever/example_scripts/contriever.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --cpus-per-task=5
3 | #SBATCH --nodes=4
4 | #SBATCH --ntasks-per-node=8
5 | #SBATCH --gres=gpu:8
6 | #SBATCH --time=72:00:00
7 | #SBATCH --job-name=contriever
8 | #SBATCH --output=/private/home/gizacard/contriever/logtrain/%A
9 | #SBATCH --partition=learnlab
10 | #SBATCH --mem=450GB
11 | #SBATCH --signal=USR1@140
12 | #SBATCH --open-mode=append
13 |
14 |
15 | port=$(shuf -i 15000-16000 -n 1)
16 | TDIR="/private/home/gizacard/contriever/encoded-data"
17 | TRAINDATASETS="${TDIR}/wikisub/ ${TDIR}/cc-netsub/"
18 |
19 | rmin=0.05
20 | rmax=0.5
21 | T=0.05
22 | QSIZE=131072
23 | MOM=0.9995
24 | POOL=average
25 | AUG=delete
26 | PAUG=0.1
27 | LC=0.
28 | mo=bert-base-uncased
29 | mp=none
30 |
31 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG
32 |
33 | srun ~oceanntwt/anaconda3/envs/contriever/bin/python3 train.py \
34 | --model_path $mp \
35 | --sampling_coefficient $LC \
36 | --retriever_model_id $mo --pooling $POOL \
37 | --augmentation $AUG --prob_augmentation $PAUG \
38 | --train_data $TRAINDATASETS --loading_mode split \
39 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \
40 | --momentum $MOM --queue_size $QSIZE --temperature $T \
41 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \
42 | --name $name \
43 | --scheduler linear \
44 | --optim adamw \
45 | --per_gpu_batch_size 64 \
46 | --output_dir /checkpoint/oceanntwt/contriever/$name \
47 | --main_port $port \
48 |
49 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/ds_config/ds_config_zero3_auto.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupDecayLR",
24 | "params": {
25 | "total_num_steps": "auto",
26 | "warmup_min_lr": "auto",
27 | "warmup_max_lr": "auto",
28 | "warmup_num_steps": "auto"
29 | }
30 | },
31 | "zero_optimization": {
32 | "stage": 3,
33 | "offload_optimizer": {
34 | "device": "cpu",
35 | "pin_memory": true
36 | },
37 | "offload_param": {
38 | "device": "cpu",
39 | "pin_memory": true
40 | },
41 | "overlap_comm": true,
42 | "contiguous_gradients": true,
43 | "allgather_partitions": true,
44 | "allgather_bucket_size": 5e8,
45 | "sub_group_size": 1e9,
46 | "reduce_bucket_size": "auto",
47 | "stage3_prefetch_bucket_size": "auto",
48 | "stage3_param_persistence_threshold": "auto",
49 | "stage3_max_live_parameters": 1e9,
50 | "stage3_max_reuse_distance": 1e9,
51 | "stage3_gather_16bit_weights_on_model_save": true
52 | },
53 | "train_batch_size": "auto",
54 | "train_micro_batch_size_per_gpu": "auto",
55 | "gradient_accumulation_steps": "auto",
56 | "gradient_clipping": "auto",
57 | "steps_per_print": 5,
58 | "wall_clock_breakdown": false
59 | }
60 |
--------------------------------------------------------------------------------
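
Note on the ZeRO-3 config above: most fields are set to "auto" so that DeepSpeed inherits them from the HuggingFace training arguments. A minimal sketch (values and paths are illustrative, not taken from this repo) of how such a config file is usually wired into the HF Trainer:

    from transformers import TrainingArguments

    # The "auto" entries (lr, betas, batch sizes, warmup steps, ...) are resolved
    # by the HF Trainer from these arguments when the JSON file is passed here.
    training_args = TrainingArguments(
        output_dir='work_dirs/llama2-sft',  # illustrative output path
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,
        warmup_steps=100,
        bf16=True,
        deepspeed='scripts/ds_config/ds_config_zero3_auto.json',
    )
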
/source/model/llama2/chatllms/configs/infer_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 |
5 | @dataclass
6 | class ModelInferenceArguments:
7 | cache_dir: Optional[str] = field(default=None)
8 | model_name_or_path: Optional[str] = field(
9 | default='facebook/opt-125m',
10 | metadata={'help': 'Path to pre-trained model'})
11 | model_revision: str = field(
12 | default='main',
13 | metadata={
14 | 'help':
15 | 'The specific model version to use (can be a branch name, tag name or commit id).'
16 | },
17 | )
18 | trust_remote_code: Optional[bool] = field(
19 | default=False,
20 | metadata={
21 | 'help':
22 | 'Allow executing custom model code from the Hub in AutoModelForCausalLM.from_pretrained.'
23 | })
24 | use_auth_token: Optional[bool] = field(
25 | default=False,
26 | metadata={
27 | 'help':
28 | 'Enables using Huggingface auth token from Git Credentials.'
29 | })
30 | model_max_length: int = field(
31 | default=2048,
32 | metadata={
33 | 'help':
34 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
35 | },
36 | )
37 | low_cpu_mem_usage: bool = field(
38 | default=True,
39 | metadata={'help': 'Whether to use low cpu memory usage mode.'})
40 | fp16: bool = field(default=False,
41 | metadata={'help': 'Whether to use fp16.'})
42 | prompt_template: str = field(
43 | default='default',
44 | metadata={
45 | 'help':
46 | 'Prompt template name, e.g. vanilla, alpaca, llama2, vicuna.'
47 | })
48 | source_prefix: Optional[str] = field(
49 | default=None,
50 | metadata={'help': 'Prefix to prepend to every source text.'})
51 |
--------------------------------------------------------------------------------
/retrieval_contriever/example_scripts/mcontriever.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --cpus-per-task=5
3 | #SBATCH --nodes=8
4 | #SBATCH --ntasks-per-node=8
5 | #SBATCH --gres=gpu:8
6 | #SBATCH --time=72:00:00
7 | #SBATCH --job-name=mcontriever
8 | #SBATCH --output=/private/home/oceanntwt/contriever/logtrain/%A
9 | #SBATCH --partition=learnlab
10 | #SBATCH --mem=450GB
11 | #SBATCH --signal=USR1@140
12 | #SBATCH --open-mode=append
13 |
14 |
15 | port=$(shuf -i 15000-16000 -n 1)
16 |
17 | TDIR=/private/home/oceanntwt/contriever/encoded-data/bert-base-multilingual-cased/
18 | TRAINDATASETS="${TDIR}fr_XX ${TDIR}en_XX ${TDIR}ar_AR ${TDIR}bn_IN ${TDIR}fi_FI ${TDIR}id_ID ${TDIR}ja_XX ${TDIR}ko_KR ${TDIR}ru_RU ${TDIR}sw_KE ${TDIR}hu_HU ${TDIR}he_IL ${TDIR}it_IT ${TDIR}km_KM ${TDIR}ms_MY ${TDIR}nl_XX ${TDIR}no_XX ${TDIR}pl_PL ${TDIR}pt_XX ${TDIR}sv_SE ${TDIR}te_IN ${TDIR}th_TH ${TDIR}tr_TR ${TDIR}vi_VN ${TDIR}zh_CN ${TDIR}zh_TW ${TDIR}es_XX ${TDIR}de_DE ${TDIR}da_DK"
19 |
20 | rmin=0.1
21 | rmax=0.5
22 | T=0.05
23 | QSIZE=32768
24 | MOM=0.999
25 | POOL=average
26 | AUG=none
27 | PAUG=0.
28 | LC=0.
29 | mo=bert-base-multilingual-cased
30 | mp=none
31 |
32 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG
33 |
34 | srun ~oceanntwt/anaconda3/envs/pytorch10/bin/python3 ~oceanntwt/contriever/train.py \
35 | --model_path $mp \
36 | --sampling_coefficient $LC \
37 | --augmentation $AUG --prob_augmentation $PAUG \
38 | --retriever_model_id $mo --pooling $POOL \
39 | --train_data $TRAINDATASETS --loading_mode split \
40 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \
41 | --momentum $MOM --queue_size $QSIZE --temperature $T \
42 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \
43 | --name $name \
44 | --scheduler linear \
45 | --optim adamw \
46 | --per_gpu_batch_size 64 \
47 | --output_dir /checkpoint/oceanntwt/contriever/xling/$name \
48 | --main_port $port \
49 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/utils/stream_server.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers to support streaming generate output.
3 | Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py
4 | """
5 | import traceback
6 | from queue import Queue
7 | from threading import Thread
8 |
9 | import transformers
10 |
11 |
12 | class Stream(transformers.StoppingCriteria):
13 | def __init__(self, callback_func=None):
14 | self.callback_func = callback_func
15 |
16 | def __call__(self, input_ids, scores) -> bool:
17 | if self.callback_func is not None:
18 | self.callback_func(input_ids[0])
19 | return False
20 |
21 |
22 | class Iteratorize:
23 | """
24 | Transforms a function that takes a callback
25 | into a lazy iterator (generator).
26 | """
27 | def __init__(self, func, kwargs=None, callback=None):
28 | self.mfunc = func
29 | self.c_callback = callback
30 | self.q = Queue()
31 | self.sentinel = object()
32 | self.kwargs = kwargs if kwargs is not None else {}
33 | self.stop_now = False
34 |
35 | def _callback(val):
36 | if self.stop_now:
37 | raise ValueError
38 | self.q.put(val)
39 |
40 | def gentask():
41 | try:
42 | ret = self.mfunc(callback=_callback, **self.kwargs)
43 | except ValueError:
44 | ret = None
45 | except Exception:
46 | traceback.print_exc()
47 | ret = None
48 |
49 | self.q.put(self.sentinel)
50 | if self.c_callback:
51 | self.c_callback(ret)
52 |
53 | self.thread = Thread(target=gentask)
54 | self.thread.start()
55 |
56 | def __iter__(self):
57 | return self
58 |
59 | def __next__(self):
60 | obj = self.q.get(True, None)
61 | if obj is self.sentinel:
62 | raise StopIteration
63 | else:
64 | return obj
65 |
66 | def __enter__(self):
67 | return self
68 |
69 | def __exit__(self, exc_type, exc_val, exc_tb):
70 | self.stop_now = True
71 |
--------------------------------------------------------------------------------
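
A minimal usage sketch for the Iteratorize helper above: it runs a callback-style producer on a background thread and exposes the callback values as a lazy iterator. The fake_generate function below is hypothetical (a stand-in for a generate() call that accepts a streaming callback), and the import assumes the llama2 package root is on PYTHONPATH:

    from chatllms.utils.stream_server import Iteratorize

    def fake_generate(callback=None, prompt=''):
        # Hypothetical producer: emit "tokens" one by one through the callback.
        for token in (prompt + ' -> streamed output').split():
            callback(token)

    with Iteratorize(fake_generate, kwargs={'prompt': 'hello world'}) as stream:
        for token in stream:
            print(token, end=' ', flush=True)
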
/source/model/llama2/chatllms/data/data_loader.py:
--------------------------------------------------------------------------------
1 | from transformers.tokenization_utils import PreTrainedTokenizer
2 |
3 | from .conv_dataset import ConversationDataset, VicunaDataset
4 | from .data_utils import make_data_module
5 | from .sft_dataset import (DataCollatorForSupervisedDataset,
6 | SFTInstructionDataset)
7 |
8 |
9 | def make_supervised_data_module(tokenizer: PreTrainedTokenizer, args):
10 | train_dataset, eval_dataset, multi_turn = make_data_module(args)
11 | max_seq_length = tokenizer.model_max_length
12 | dataset_cls = (VicunaDataset if args.conversation_template == 'vicuna' else
13 | ConversationDataset)
14 |
15 | if not multi_turn:
16 | train_dataset = SFTInstructionDataset(
17 | train_dataset,
18 | tokenizer=tokenizer,
19 | max_seq_len=max_seq_length,
20 | ) if args.do_train else None
21 |
22 | eval_dataset = SFTInstructionDataset(
23 | eval_dataset,
24 | tokenizer=tokenizer,
25 | max_seq_len=max_seq_length,
26 | ) if args.do_eval else None
27 |
28 | else:
29 | train_dataset = dataset_cls(
30 | train_dataset,
31 | tokenizer=tokenizer,
32 | max_seq_length=max_seq_length,
33 | ) if args.do_train else None
34 | eval_dataset = dataset_cls(
35 | eval_dataset,
36 | tokenizer=tokenizer,
37 | max_seq_length=max_seq_length,
38 | ) if args.do_eval else None
39 |
40 | print(
41 | f'train_dataset: {type(train_dataset)}, multi-turn: {multi_turn}, #length: {len(train_dataset)}'
42 | ) if args.do_train else None
43 | print(
44 | f'eval_dataset: {type(eval_dataset)}, multi-turn: {multi_turn}, #length: {len(eval_dataset)}'
45 | ) if args.do_eval else None
46 |
47 | print('Adding data collator: ', DataCollatorForSupervisedDataset)
48 | data_collator = DataCollatorForSupervisedDataset(
49 | tokenizer=tokenizer, predict_with_generate=args.predict_with_generate)
50 |
51 | return {
52 | 'train_dataset': train_dataset,
53 | 'eval_dataset': eval_dataset,
54 | 'data_collator': data_collator
55 | }
56 |
--------------------------------------------------------------------------------
/source/model/flan-t5/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 |
4 | from datasets import load_dataset, concatenate_datasets, DatasetDict
5 | from transformers import AutoTokenizer
6 | from sklearn.model_selection import train_test_split
7 | from typing import List, Union
8 |
9 |
10 | def clean_text(
11 | texts: List[Union[str, None]], labels: List[Union[str, None]]
12 | ) -> pd.DataFrame:
13 | """
14 | The News Group dataset needs to be preprocessed as it has a lot of
15 | entries with NULL text and/or NULL labels.
16 | In this function we simply filter out the NULL entries, and
17 | return a new dataframe with clean texts and labels.
18 | """
19 | new_texts, new_labels = [], []
20 | for text, label in zip(texts, labels):
21 | if isinstance(text, str) and isinstance(label, str):
22 | new_texts.append(text)
23 | new_labels.append(label)
24 | new_ids = [i for i in range(len(new_texts))]
25 | df = pd.DataFrame(data={"id": new_ids, "text": new_texts, "label": new_labels})
26 |
27 | return df
28 |
29 | def get_data(tokenizer: AutoTokenizer) -> List[Union[DatasetDict, int, int]]:
30 | dataset_id = "nq_open"
31 | # Load dataset from the hub
32 | dataset = load_dataset(dataset_id)
33 |
34 | print(f"Train dataset size: {len(dataset['train'])}")
35 | print(f"Test dataset size: {len(dataset['validation'])}") # if validate
36 |
37 | tokenized_inputs = concatenate_datasets([dataset["train"]]).map(
38 | lambda x: tokenizer(x["question"], truncation=True),
39 | batched=True,
40 | remove_columns=["question", "answer"],
41 | )
42 |
43 | max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
44 | print(f"Max source length: {max_source_length}")
45 |
46 | tokenized_targets = concatenate_datasets([dataset["train"], dataset["validation"]]).map(
47 | lambda x: tokenizer(x["answer"], truncation=True),
48 | batched=True,
49 | remove_columns=["question", "answer"],
50 | )
51 |
52 | max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
53 | print(f"Max target length: {max_target_length}")
54 |
55 | return dataset, max_source_length, max_target_length
56 |
--------------------------------------------------------------------------------
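
A quick illustrative check of clean_text() above: pairs where either the text or the label is not a string are dropped and ids are re-assigned. The toy values are made up, and the import assumes the flan-t5 directory is the working directory:

    from utils import clean_text

    texts = ['how tall is the eiffel tower', None, 'who wrote hamlet']
    labels = ['330 m', 'placeholder', None]
    df = clean_text(texts, labels)
    print(df)  # only the first (text, label) pair survives, with id 0
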
/retrieval_contriever/evaluate_retrieved_passages.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import argparse
8 | import json
9 | import logging
10 | import glob
11 |
12 | import numpy as np
13 | import torch
14 |
15 | import src.utils
16 |
17 | from src.evaluation import calculate_matches
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | def validate(data, workers_num):
22 | match_stats = calculate_matches(data, workers_num)
23 | top_k_hits = match_stats.top_k_hits
24 |
25 | #logger.info('Validation results: top k documents hits %s', top_k_hits)
26 | top_k_hits = [v / len(data) for v in top_k_hits]
27 | #logger.info('Validation results: top k documents hits accuracy %s', top_k_hits)
28 | return top_k_hits
29 |
30 |
31 | def main(opt):
32 | logger = src.utils.init_logger(opt, stdout_only=True)
33 | datapaths = glob.glob(opt.data)
34 | r20, r100 = [], []
35 | for path in datapaths:
36 | data = []
37 | with open(path, 'r') as fin:
38 | for line in fin:
39 | data.append(json.loads(line))
40 | #data = json.load(fin)
41 | answers = [ex['answers'] for ex in data]
42 | top_k_hits = validate(data, opt.validation_workers)
43 | message = f"Evaluate results from {path}:"
44 | for k in [5, 10, 20, 100]:
45 | if k <= len(top_k_hits):
46 | recall = 100 * top_k_hits[k-1]
47 | if k == 20:
48 | r20.append(f"{recall:.1f}")
49 | if k == 100:
50 | r100.append(f"{recall:.1f}")
51 | message += f' R@{k}: {recall:.1f}'
52 | logger.info(message)
53 | print(datapaths)
54 | print('\t'.join(r20))
55 | print('\t'.join(r100))
56 |
57 |
58 | if __name__ == '__main__':
59 | parser = argparse.ArgumentParser()
60 |
61 | parser.add_argument('--data', required=True, type=str, default=None)
62 | parser.add_argument('--validation_workers', type=int, default=16,
63 | help="Number of parallel processes to validate results")
64 |
65 | args = parser.parse_args()
66 | main(args)
67 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def parse_arguments():
5 | parser = argparse.ArgumentParser()
6 |
7 | parser.add_argument(
8 | "--iteration_max_time", type=int, default=3, help="maxinum iteration in RA-iSF."
9 | )
10 | parser.add_argument(
11 | "--temperature", type=float, default=0, help=""
12 | )
13 | parser.add_argument(
14 | "--max_length", type=int, default=256, help="maxinum generation of base model"
15 | )
16 | parser.add_argument(
17 | "--type_list_file", default="./src/format/entity_type_list.txt", type=str, help='file path'
18 | )
19 | parser.add_argument(
20 | "--prompt_id", default='324', help='string'
21 | )
22 | parser.add_argument(
23 | "--infer_num", default='5', help='string'
24 | )
25 | parser.add_argument(
26 | "--engine", default='llama2-13b', help="llama2-7b, llama2-13b, gpt-3.5",
27 | choices=["llama2-7b", "llama2-13b", "gpt-3.5"]
28 | )
29 | parser.add_argument(
30 | "--api_key", default="", help="gpt3.5 api key"
31 | )
32 | parser.add_argument(
33 | "--base_model_path", default='/root/autodl-tmp/llama-7b-hf', help="your local model path"
34 | )
35 | parser.add_argument(
36 | "--self_knowledge_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel self-knowledge path"
37 | )
38 | parser.add_argument(
39 | "--passage_relevance_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel passage_relevance path"
40 | )
41 | parser.add_argument(
42 | "--task_decomposition_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel task_decomposition path"
43 | )
44 | parser.add_argument(
45 | "--data_path", default='/root/workspace/ra-isf/dataset/natural_question/nq_open.json', help="your local data path"
46 | )
47 | parser.add_argument(
48 | "--output_path", default='/root/workspace/ra-isf/output/output.json', help="your local output file data path"
49 | )
50 | parser.add_argument(
51 | "--test_start", default='0', help='string, number'
52 | )
53 | parser.add_argument(
54 | "--test_end", default='full', help='string, number'
55 | )
56 | parsed_args = parser.parse_args()
57 | return parsed_args
58 |
59 |
60 | args = parse_arguments()
61 |
--------------------------------------------------------------------------------
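
Because config.py calls parse_arguments() at import time and exposes the result as a module-level args object, other scripts can simply import it. A small hedged sketch of consuming these options (converting the string-typed range options is an assumption based on their defaults '0' and 'full'):

    from config import args  # note: importing this module parses sys.argv

    print(args.engine)            # e.g. 'llama2-13b'
    start = int(args.test_start)  # numeric options are passed as strings
    end = None if args.test_end == 'full' else int(args.test_end)
    print(f'evaluating examples from {start} to {end if end is not None else "the end"}')
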
/contriever_config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | def parse_retriever_arguments():
4 | parser = argparse.ArgumentParser()
5 |
6 | parser.add_argument(
7 | "--data",
8 | # required=True,
9 | type=str,
10 | default=None,
11 | help=".json file containing question and answers, similar format to reader data",
12 | )
13 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
14 | parser.add_argument("--passages_embeddings", type=str, default=None, help="Glob path to encoded passages")
15 | parser.add_argument(
16 | "--output_dir", type=str, default=None, help="Results are written to outputdir with data suffix"
17 | )
18 | parser.add_argument("--n_docs", type=int, default=100, help="Number of documents to retrieve per questions")
19 | parser.add_argument(
20 | "--validation_workers", type=int, default=32, help="Number of parallel processes to validate results"
21 | )
22 | parser.add_argument("--per_gpu_batch_size", type=int, default=64, help="Batch size for question encoding")
23 | parser.add_argument(
24 | "--save_or_load_index", action="store_true", help="If enabled, save index and load index if it exists"
25 | )
26 | parser.add_argument(
27 | "--model_name_or_path", type=str, default="/root/autodl-tmp/contriever-msmarco", help="path to directory containing model weights and config file"
28 | )
29 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
30 | parser.add_argument("--question_maxlength", type=int, default=512, help="Maximum number of tokens in a question")
31 | parser.add_argument(
32 | "--indexing_batch_size", type=int, default=1000000, help="Batch size of the number of passages indexed"
33 | )
34 | parser.add_argument("--projection_size", type=int, default=768)
35 | parser.add_argument(
36 | "--n_subquantizers",
37 | type=int,
38 | default=0,
39 | help="Number of subquantizer used for vector quantization, if 0 flat index is used",
40 | )
41 | parser.add_argument("--n_bits", type=int, default=8, help="Number of bits per subquantizer")
42 | parser.add_argument("--lang", nargs="+")
43 | parser.add_argument("--dataset", type=str, default="none")
44 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
45 | parser.add_argument("--normalize_text", action="store_true", help="normalize text")
46 | parsed_args = parser.parse_args()
47 | return parsed_args
48 |
49 | c_args = parse_retriever_arguments()
--------------------------------------------------------------------------------
/retrieval_contriever/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import os
4 | import argparse
5 | import torch
6 |
7 | import transformers
8 | from src.normalize_text import normalize
9 |
10 |
11 | def save(tensor, split_path):
12 | if not os.path.exists(os.path.dirname(split_path)):
13 | os.makedirs(os.path.dirname(split_path))
14 | with open(split_path, 'wb') as fout:
15 | torch.save(tensor, fout)
16 |
17 | def apply_tokenizer(path, tokenizer, normalize_text=False):
18 | alltokens = []
19 | lines = []
20 | with open(path, "r", encoding="utf-8") as fin:
21 | for k, line in enumerate(fin):
22 | if normalize_text:
23 | line = normalize(line)
24 |
25 | lines.append(line)
26 | if len(lines) > 1000000:
27 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
28 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
29 | alltokens.extend(tokens)
30 | lines = []
31 |
32 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
33 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
34 | alltokens.extend(tokens)
35 |
36 | alltokens = torch.cat(alltokens)
37 | return alltokens
38 |
39 | def tokenize_file(args):
40 | filename = os.path.basename(args.datapath)
41 | savepath = os.path.join(args.outdir, f"{filename}.pkl")
42 | if os.path.exists(savepath):
43 | if args.overwrite:
44 | print(f"File {savepath} already exists, overwriting")
45 | else:
46 | print(f"File {savepath} already exists, exiting")
47 | return
48 | try:
49 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=True)
50 | except Exception:
51 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=False)
52 | print(f"Encoding {args.datapath}...")
53 | tokens = apply_tokenizer(args.datapath, tokenizer, normalize_text=args.normalize_text)
54 |
55 | print(f"Saving at {savepath}...")
56 | save(tokens, savepath)
57 |
58 |
59 | if __name__ == '__main__':
60 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
61 | parser.add_argument("--datapath", type=str)
62 | parser.add_argument("--outdir", type=str)
63 | parser.add_argument("--tokenizer", type=str)
64 | parser.add_argument("--overwrite", action="store_true")
65 | parser.add_argument("--normalize_text", action="store_true")
66 |
67 | args, _ = parser.parse_known_args()
68 | tokenize_file(args)
69 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/vllm/apil_chient.py:
--------------------------------------------------------------------------------
1 | """Example Python client for vllm.entrypoints.api_server"""
2 |
3 | import argparse
4 | import json
5 | from typing import Iterable, List
6 |
7 | import requests
8 |
9 |
10 | def clear_line(n: int = 1) -> None:
11 | LINE_UP = '\033[1A'
12 | LINE_CLEAR = '\x1b[2K'
13 | for _ in range(n):
14 | print(LINE_UP, end=LINE_CLEAR, flush=True)
15 |
16 |
17 | def post_http_request(prompt: str,
18 | api_url: str,
19 | n: int = 1,
20 | stream: bool = False) -> requests.Response:
21 | headers = {'User-Agent': 'Test Client'}
22 | pload = {
23 | 'prompt': prompt,
24 | 'n': n,
25 | 'use_beam_search': True,
26 | 'temperature': 0.0,
27 | 'max_tokens': 16,
28 | 'stream': stream,
29 | }
30 | response = requests.post(api_url, headers=headers, json=pload, stream=True)
31 | return response
32 |
33 |
34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
35 | for chunk in response.iter_lines(chunk_size=8192,
36 | decode_unicode=False,
37 | delimiter=b'\0'):
38 | if chunk:
39 | data = json.loads(chunk.decode('utf-8'))
40 | output = data['text']
41 | yield output
42 |
43 |
44 | def get_response(response: requests.Response) -> List[str]:
45 | data = json.loads(response.content)
46 | output = data['text']
47 | return output
48 |
49 |
50 | if __name__ == '__main__':
51 | parser = argparse.ArgumentParser()
52 | parser.add_argument('--host', type=str, default='localhost')
53 | parser.add_argument('--port', type=int, default=8000)
54 | parser.add_argument('--n', type=int, default=4)
55 | parser.add_argument('--prompt', type=str, default='San Francisco is a')
56 | parser.add_argument('--stream', action='store_true')
57 | args = parser.parse_args()
58 | prompt = args.prompt
59 | api_url = f'http://{args.host}:{args.port}/generate'
60 | n = args.n
61 | stream = args.stream
62 |
63 | print(f'Prompt: {prompt!r}\n', flush=True)
64 | response = post_http_request(prompt, api_url, n, stream)
65 |
66 | if stream:
67 | num_printed_lines = 0
68 | for h in get_streaming_response(response):
69 | clear_line(num_printed_lines)
70 | num_printed_lines = 0
71 | for i, line in enumerate(h):
72 | num_printed_lines += 1
73 | print(f'Beam candidate {i}: {line!r}', flush=True)
74 | else:
75 | output = get_response(response)
76 | for i, line in enumerate(output):
77 | print(f'Beam candidate {i}: {line!r}', flush=True)
78 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/clean_data.sh:
--------------------------------------------------------------------------------
1 | # sharegpt
2 | python clean_sharegpt.py \
3 | --in-file /userhome/jianzhnie/prompt_data/anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json \
4 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json
5 |
6 | python split_long_conversation.py \
7 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json \
8 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json \
9 | --model-name-or-path /userhome/jianzhnie/checkpoints/llama7b
10 |
11 | python clean_evol_instruct.py \
12 | --in-file /userhome/jianzhnie/prompt_data/WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json \
13 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json
14 |
15 | python merge.py \
16 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json \
17 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_sharegpt_merge.json
18 |
19 | # chinese data
20 | python chatllms/data/utils/convert_alpaca.py \
21 | --in-file ./prompt_data/chinese_data/alpaca_data_zh_51k.json \
22 | --out-file ./prompt_data/chinese_data/alpaca_vicuna.json
23 |
24 | python chatllms/data/utils/convert_alpaca.py \
25 | --in-file ./prompt_data/InstructionWild/instinwild_ch.json \
26 | --out-file ./prompt_data/chinese_data/instinwild_ch_vicuna.json
27 |
28 | python chatllms/data/utils/convert_alpaca.py \
29 | --in-file ./prompt_data/InstructionWild/instinwild_en.json \
30 | --out-file ./prompt_data/chinese_data/instinwild_en_vicuna.json
31 |
32 | python chatllms/data/utils/convert_alpaca.py \
33 | --in-file ./prompt_data/databricks-dolly-15k/databricks-dolly-15k.jsonl \
34 | --out-file ./prompt_data/chinese_data/dolly-15k_vicuna.json
35 |
36 | python merge.py \
37 | --in-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/alpaca_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/dolly-15k_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_ch_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_en_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/olcc.json \
38 | --out-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/vicuna_merge.json
39 |
40 |
41 | # belle-group
42 | python chatllms/data/utils/convert_alpaca.py \
43 | --in-file ./prompt_data/belle_group/generated_chat_0.4M/generated_chat_0.4M.json \
44 | --out-file ./prompt_data/belle_group/generated_chat_vicuna.json
45 |
46 |
47 | python chatllms/data/utils/convert_alpaca.py \
48 | --in-file ./prompt_data/belle_group/school_math_0.25M/school_math_0.25M.json \
49 | --out-file ./prompt_data/belle_group/school_math_vicuna.json
50 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/test_convdataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | import numpy as np
5 |
6 | sys.path.append('../')
7 | from typing import Any, Dict
8 |
9 | import transformers
10 |
11 | from chatllms.data.conv_dataset import ConversationDataset, UltraChatDataset
12 | from chatllms.data.data_utils import (DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN,
13 | DEFAULT_PAD_TOKEN, DEFAULT_UNK_TOKEN)
14 |
15 | if __name__ == '__main__':
16 | # Load the raw data from the specified data_path
17 | data_path = '/home/robin/work_dir/llm/FastChat/data/dummy_conversation.json'
18 | with open(data_path, 'r') as file:
19 | raw_data = json.load(file)
20 |
21 | model_name_or_path = '/home/robin/checkpoints/baichuan7b'
22 | model_name_or_path = 'facebook/opt-125m'
23 | sources = [example['conversations'] for example in raw_data]
24 | tokenizer = transformers.AutoTokenizer.from_pretrained(
25 | model_name_or_path,
26 | model_max_length=64,
27 | padding_side='right',
28 | use_fast=False,
29 | add_special_tokens=False,
30 | tokenizer_type='llama' if 'llama' in model_name_or_path else 'gpt2',
31 | )
32 | # Define a dictionary to store any missing special tokens along with their default values
33 | special_tokens_dict: Dict[str, Any] = {}
34 |
35 | # Check if each special token is present. If not, add it to the special_tokens_dict with its default value.
36 | if tokenizer.pad_token is None:
37 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
38 | if tokenizer.eos_token is None:
39 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
40 | if tokenizer.bos_token is None:
41 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
42 | if tokenizer.unk_token is None:
43 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
44 |
45 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
46 |
47 | print(tokenizer.bos_token)
48 | # # Apply the conversation function to the raw data
49 | dataset = ConversationDataset(sources, tokenizer, 64)
50 |
51 | for idx, data in enumerate(dataset):
52 | print('==' * 10)
53 | input_ids = data['input_ids']
54 | input_txt = tokenizer.decode(input_ids)
55 | print(input_txt)
56 | targets = data['labels']
57 | input_ids = np.array(input_ids)
58 | target_text = tokenizer.decode(targets)
59 | print(target_text)
60 | if idx > 10:
61 | break
62 |
63 | dataset = UltraChatDataset(sources, tokenizer, 128)
64 | for idx, data in enumerate(dataset):
65 | input_ids = data['input_ids']
66 | labels = data['labels']
67 | input_txt = tokenizer.decode(input_ids)
68 | target_text = tokenizer.decode(labels)
69 | print(input_txt)
70 | print(target_text)
71 | if idx > 10:
72 | break
73 |
--------------------------------------------------------------------------------
/source/model/llama2/server/multi_chat.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from threading import Thread
3 |
4 | import torch
5 | import transformers
6 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
7 | TextIteratorStreamer)
8 |
9 | sys.path.append('../')
10 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
11 | from chatllms.utils.model_utils import get_logits_processor
12 |
13 |
14 | def main(model_server_args, generation_args):
15 | """
16 | Multi-turn chat demo: the full conversation history is concatenated into the prompt on every turn (no structured memory management).
17 | """
18 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
19 | model = AutoModelForCausalLM.from_pretrained(
20 | model_server_args.model_name_or_path,
21 | cache_dir=model_server_args.cache_dir,
22 | trust_remote_code=True,
23 | low_cpu_mem_usage=True,
24 | torch_dtype=torch.float16,
25 | device_map='auto').to(device).eval()
26 | tokenizer = AutoTokenizer.from_pretrained(
27 | model_server_args.model_name_or_path,
28 | trust_remote_code=True,
29 | use_fast=False,
30 | )
31 | # Keep the full conversation history
32 | historys = tokenizer.bos_token
33 | print('User: ', end='', flush=True)
34 | user_input = input('')
35 | while True:
36 | user_input = '{}'.format(user_input).strip()
37 | historys = historys + user_input
38 | inputs = tokenizer(historys,
39 | return_tensors='pt',
40 | add_special_tokens=False)
41 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
42 |
43 | # Create a TextIteratorStreamer object to stream the response from the model
44 | streamer = TextIteratorStreamer(tokenizer,
45 | timeout=60.0,
46 | skip_prompt=True,
47 | skip_special_tokens=True)
48 |
49 | # Set the arguments for the model's generate() method
50 | gen_kwargs = dict(
51 | inputs,
52 | streamer=streamer,
53 | logits_processor=get_logits_processor(),
54 | **generation_args.to_dict(),
55 | )
56 |
57 | # Start a separate thread to generate the response asynchronously
58 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
59 | thread.start()
60 |
61 | # Print the model name and the response as it is generated
62 | print('Assistant: ', end='', flush=True)
63 | response = ''
64 | for new_text in streamer:
65 | print(new_text, end='', flush=True)
66 | response += new_text
67 |
68 | historys = historys + response
69 | print('\n')
70 | print('User: ', end='', flush=True)
71 | user_input = input('')
72 |
73 |
74 | if __name__ == '__main__':
75 | parser = transformers.HfArgumentParser(
76 | (ModelInferenceArguments, GenerationArguments))
77 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
78 | main(model_server_args, generation_args)
79 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/format_data/convert_oasst1.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import random
5 |
6 |
7 | def json_dump(obj, path):
8 | with open(path, 'w', encoding='utf-8') as f:
9 | json.dump(obj, f, indent=2, ensure_ascii=False)
10 |
11 |
12 | def json_load(in_file):
13 | with open(in_file, 'r') as f:
14 | json_data = json.load(f)
15 | return json_data
16 |
17 |
18 | def convert_oasst1_data(data_dir, output_dir):
19 | '''
20 | For OASST1, because it's in a tree structure, where every user input might get multiple replies,
21 | we have to save every path from the root node to the assistant reply (including both leaf nodes and intermediate nodes).
22 | This results in some of the messages being duplicated among different paths (instances).
23 | Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path.
24 | '''
25 | conversations = []
26 | with open(os.path.join(data_dir, '2023-04-12_oasst_ready.trees.jsonl'),
27 | 'r') as fin:
28 | for line in fin:
29 | conversations.append(json.loads(line))
30 |
31 | output_path = os.path.join(output_dir, 'oasst1_data.jsonl')
32 |
33 | # traverse the conversation tree and collect all valid sequences
34 | def dfs(reply, messages, valid_sequences):
35 | if reply['role'] == 'assistant':
36 | messages.append({'role': 'assistant', 'content': reply['text']})
37 | valid_sequences.append(messages[:])
38 | for child in reply['replies']:
39 | dfs(child, messages, valid_sequences)
40 | messages.pop()
41 | elif reply['role'] == 'prompter':
42 | messages.append({'role': 'user', 'content': reply['text']})
43 | for child in reply['replies']:
44 | dfs(child, messages, valid_sequences)
45 | messages.pop()
46 | else:
47 | raise ValueError(f"Unknown role: {reply['role']}")
48 |
49 | with open(output_path, 'w') as fout:
50 | example_cnt = 0
51 | for _, conversation in enumerate(conversations):
52 | valid_sequences = []
53 | dfs(conversation['prompt'], [], valid_sequences)
54 | for sequence in valid_sequences:
55 | fout.write(
56 | json.dumps({
57 | 'dataset': 'oasst1',
58 | 'id': f'oasst1_{example_cnt}',
59 | 'messages': sequence
60 | }) + '\n')
61 | example_cnt += 1
62 |
63 |
64 | if __name__ == '__main__':
65 | arg_parser = argparse.ArgumentParser()
66 | arg_parser.add_argument('--raw_data_dir',
67 | type=str,
68 | default='data/downloads')
69 | arg_parser.add_argument('--output_dir', type=str, default='data/processed')
70 | arg_parser.add_argument('--seed', type=int, default=42)
71 | args = arg_parser.parse_args()
72 | random.seed(args.seed)
73 |
74 | convert_oasst1_data(data_dir=args.raw_data_dir, output_dir=args.output_dir)
75 |
--------------------------------------------------------------------------------
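
The traversal described in the convert_oasst1_data docstring above is the key step: every root-to-assistant path in the reply tree becomes one training sequence. Since the inner dfs is not importable, here is a standalone re-statement of the same logic on a hypothetical toy tree:

    def collect_paths(reply, messages, valid_sequences):
        role = 'assistant' if reply['role'] == 'assistant' else 'user'
        messages.append({'role': role, 'content': reply['text']})
        if reply['role'] == 'assistant':
            valid_sequences.append(messages[:])   # record one root-to-assistant path
        for child in reply.get('replies', []):
            collect_paths(child, messages, valid_sequences)
        messages.pop()

    toy_tree = {
        'role': 'prompter', 'text': 'Hi!',
        'replies': [
            {'role': 'assistant', 'text': 'Hello.', 'replies': []},
            {'role': 'assistant', 'text': 'Hey there.', 'replies': []},
        ],
    }
    sequences = []
    collect_paths(toy_tree, [], sequences)
    print(len(sequences))  # 2 sequences, one per assistant reply
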
/source/model/llama2/chatllms/configs/train_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 | from transformers import TrainingArguments
5 |
6 |
7 | @dataclass
8 | class TrainingArguments(TrainingArguments):
9 | # Cache directory.
10 | cache_dir: Optional[str] = field(default=None)
11 | # Full finetuning without adapters (i.e. without LoRA or QLoRA).
12 | full_finetune: bool = field(
13 | default=False,
14 | metadata={'help': 'Finetune the entire model without adapters.'})
15 | # Whether to run training (almost always True).
16 | do_train: bool = field(
17 | default=True,
18 | metadata={'help': 'To train or not to train, that is the question?'})
19 | # Whether to run evaluation.
20 | do_eval: bool = field(
21 | default=False,
22 | metadata={'help': 'Whether to run evaluation on the eval dataset.'})
23 | # Whether to run the MMLU evaluation.
24 | do_mmlu_eval: Optional[bool] = field(
25 | default=False,
26 | metadata={'help': 'Whether to run the MMLU evaluation.'})
27 | # Default MMLU dataset: `mmlu-zs` for zero-shot or `mmlu-fs` for few-shot.
28 | mmlu_dataset: Optional[str] = field(
29 | default='mmlu-fs',
30 | metadata={
31 | 'help':
32 | 'MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot.'
33 | })
34 | # Default MMLU split: `eval` for evaluation or `test` for testing.
35 | mmlu_split: Optional[str] = field(
36 | default='eval', metadata={'help': 'The MMLU split to run on'})
37 | # Maximum number of MMLU samples to evaluate on.
38 | max_mmlu_samples: Optional[int] = field(
39 | default=None,
40 | metadata={
41 | 'help':
42 | 'If set, only evaluates on `max_mmlu_samples` of the MMLU dataset.'
43 | })
44 | # Maximum length of the MMLU source text (see the code for whether this counts characters or tokens).
45 | mmlu_source_max_len: int = field(
46 | default=2048,
47 | metadata={'help': 'Maximum source sequence length for mmlu.'})
48 | # Whether to do sample generation during evaluation.
49 | sample_generate: bool = field(
50 | default=False,
51 | metadata={'help': 'If do sample generation on evaluation.'})
52 | # NVIDIA paged optimizer: lets training continue through occasional OOM spikes.
53 | optim: str = field(default='paged_adamw_32bit',
54 | metadata={'help': 'The optimizer to be used'})
55 | # Gradient clipping max norm.
56 | max_grad_norm: float = field(
57 | default=0.3,
58 | metadata={
59 | 'help':
60 | 'Gradient clipping max norm. This is tuned and works well for all models tested.'
61 | })
62 | # Gradient checkpointing: set to True to reduce GPU memory usage.
63 | # With tight GPU memory this should be True, at the cost of longer training time.
64 | gradient_checkpointing: bool = field(
65 | default=True,
66 | metadata={'help': 'Use gradient checkpointing. You want to use this.'})
67 | predict_with_generate: bool = field(
68 | default=False,
69 | metadata={
70 | 'help':
71 | 'Whether to use generate() to compute generative metrics during evaluation.'
72 | })
73 | model_max_length: int = field(
74 | default=1024,
75 | metadata={
76 | 'help':
77 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
78 | },
79 | )
80 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/data/utils/convert_alpaca.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert alpaca dataset into sharegpt format.
3 |
4 | Usage: python3 -m chatllms.data.utils.convert_alpaca --in-file alpaca_data.json --out-file alpaca_vicuna.json
5 | """
6 |
7 | import argparse
8 | import json
9 | from typing import Any, Dict, List
10 |
11 | from datasets import load_dataset
12 |
13 |
14 | def json_dump(obj, path):
15 | with open(path, 'w', encoding='utf-8') as f:
16 | json.dump(obj, f, indent=2, ensure_ascii=False)
17 |
18 |
19 | def json_load(in_file):
20 | with open(in_file, 'r') as f:
21 | json_data = json.load(f)
22 | return json_data
23 |
24 |
25 | def valid_keys(keys):
26 | for k in ['instruction', 'input', 'output']:
27 | if k not in keys:
28 | return False
29 | return True
30 |
31 |
32 | def convert_alpaca_vicuna(raw_data: List[Dict[str, Any]]):
33 | collect_data = []
34 | for i, content in enumerate(raw_data):
35 | if not valid_keys(content.keys()):
36 | continue
37 |
38 | if len(content['input'].strip()) > 1:
39 | q, a = content['instruction'] + '\nInput:\n' + content[
40 | 'input'], content['output']
41 | else:
42 | q, a = content['instruction'], content['output']
43 |
44 | collect_data.append({
45 | 'id':
46 | f'alpaca_{i}',
47 | 'conversations': [
48 | {
49 | 'from': 'human',
50 | 'value': q
51 | },
52 | {
53 | 'from': 'gpt',
54 | 'value': a
55 | },
56 | ],
57 | })
58 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}')
59 | return collect_data
60 |
61 |
62 | def convert_dolly_vicuna(raw_data: List[Dict[str, Any]]):
63 | collect_data = []
64 | for i, content in enumerate(raw_data):
65 | if len(content['context'].strip()) > 1:
66 | q, a = content['instruction'] + '\nInput:\n' + content[
67 | 'context'], content['response']
68 | else:
69 | q, a = content['instruction'], content['response']
70 |
71 | collect_data.append({
72 | 'id':
73 | f'alpaca_{i}',
74 | 'conversations': [
75 | {
76 | 'from': 'human',
77 | 'value': q
78 | },
79 | {
80 | 'from': 'gpt',
81 | 'value': a
82 | },
83 | ],
84 | })
85 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}')
86 | return collect_data
87 |
88 |
89 | def main():
90 | parser = argparse.ArgumentParser()
91 | parser.add_argument('--in-file', type=str)
92 | parser.add_argument('--out-file', type=str)
93 | args = parser.parse_args()
94 |
95 | raw_data = load_dataset('json', data_files=args.in_file)['train']
96 | new_data = convert_alpaca_vicuna(raw_data)
97 |
98 | # new_data = convert_dolly_vicuna(raw_data)
99 | # new_data = convert_alpaca_vicuna(raw_data)
100 | json_dump(new_data, args.out_file)
101 |
102 |
103 | if __name__ == '__main__':
104 | main()
105 |
--------------------------------------------------------------------------------
/retrieval_contriever/src/index.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import os
8 | import pickle
9 | from typing import List, Tuple
10 |
11 | import faiss
12 | import numpy as np
13 | from tqdm import tqdm
14 |
15 | class Indexer(object):
16 |
17 | def __init__(self, vector_sz, n_subquantizers=0, n_bits=8):
18 | if n_subquantizers > 0:
19 | self.index = faiss.IndexPQ(vector_sz, n_subquantizers, n_bits, faiss.METRIC_INNER_PRODUCT)
20 | else:
21 | self.index = faiss.IndexFlatIP(vector_sz)
22 | #self.index_id_to_db_id = np.empty((0), dtype=np.int64)
23 | self.index_id_to_db_id = []
24 |
25 | def index_data(self, ids, embeddings):
26 | self._update_id_mapping(ids)
27 | embeddings = embeddings.astype('float32')
28 | if not self.index.is_trained:
29 | self.index.train(embeddings)
30 | self.index.add(embeddings)
31 |
32 | print(f'Total data indexed {len(self.index_id_to_db_id)}')
33 |
34 | def search_knn(self, query_vectors: np.array, top_docs: int, index_batch_size: int = 2048) -> List[Tuple[List[object], List[float]]]:
35 | query_vectors = query_vectors.astype('float32')
36 | result = []
37 | nbatch = (len(query_vectors)-1) // index_batch_size + 1
38 | for k in tqdm(range(nbatch)):
39 | start_idx = k*index_batch_size
40 | end_idx = min((k+1)*index_batch_size, len(query_vectors))
41 | q = query_vectors[start_idx: end_idx]
42 | scores, indexes = self.index.search(q, top_docs)
43 | # convert to external ids
44 | db_ids = [[str(self.index_id_to_db_id[i]) for i in query_top_idxs] for query_top_idxs in indexes]
45 | result.extend([(db_ids[i], scores[i]) for i in range(len(db_ids))])
46 | return result
47 |
48 | def serialize(self, dir_path):
49 | index_file = os.path.join(dir_path, 'index.faiss')
50 | meta_file = os.path.join(dir_path, 'index_meta.faiss')
51 | print(f'Serializing index to {index_file}, meta data to {meta_file}')
52 |
53 | faiss.write_index(self.index, index_file)
54 | with open(meta_file, mode='wb') as f:
55 | pickle.dump(self.index_id_to_db_id, f)
56 |
57 | def deserialize_from(self, dir_path):
58 | index_file = os.path.join(dir_path, 'index.faiss')
59 | meta_file = os.path.join(dir_path, 'index_meta.faiss')
60 | print(f'Loading index from {index_file}, meta data from {meta_file}')
61 |
62 | self.index = faiss.read_index(index_file)
63 | print(f'Loaded index of type {type(self.index)} and size {self.index.ntotal}')
64 |
65 | with open(meta_file, "rb") as reader:
66 | self.index_id_to_db_id = pickle.load(reader)
67 | assert len(
68 | self.index_id_to_db_id) == self.index.ntotal, 'Deserialized index_id_to_db_id should match faiss index size'
69 |
70 | def _update_id_mapping(self, db_ids: List):
71 | #new_ids = np.array(db_ids, dtype=np.int64)
72 | #self.index_id_to_db_id = np.concatenate((self.index_id_to_db_id, new_ids), axis=0)
73 | self.index_id_to_db_id.extend(db_ids)
--------------------------------------------------------------------------------
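
A self-contained sketch of the Indexer API above using random toy embeddings (the import path follows the repo's retrieval_contriever.src layout and is an assumption about how the package sits on PYTHONPATH):

    import numpy as np
    from retrieval_contriever.src.index import Indexer

    index = Indexer(vector_sz=768)                      # flat inner-product index
    ids = [f'doc{i}' for i in range(1000)]
    embeddings = np.random.rand(1000, 768).astype('float32')
    index.index_data(ids, embeddings)

    queries = np.random.rand(2, 768).astype('float32')
    for db_ids, scores in index.search_knn(queries, top_docs=5):
        print(db_ids, scores)
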
/retrieval_contriever/src/inbatch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import numpy as np
6 | import math
7 | import random
8 | import transformers
9 | import logging
10 | import torch.distributed as dist
11 |
12 | from retrieval_contriever.src import contriever, dist_utils, utils
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | class InBatch(nn.Module):
18 | def __init__(self, opt, retriever=None, tokenizer=None):
19 | super(InBatch, self).__init__()
20 |
21 | self.opt = opt
22 | self.norm_doc = opt.norm_doc
23 | self.norm_query = opt.norm_query
24 | self.label_smoothing = opt.label_smoothing
25 | if retriever is None or tokenizer is None:
26 | retriever, tokenizer = self._load_retriever(
27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init
28 | )
29 | self.tokenizer = tokenizer
30 | self.encoder = retriever
31 |
32 | def _load_retriever(self, model_id, pooling, random_init):
33 | cfg = utils.load_hf(transformers.AutoConfig, model_id)
34 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id)
35 |
36 | if "xlm" in model_id:
37 | model_class = contriever.XLMRetriever
38 | else:
39 | model_class = contriever.Contriever
40 |
41 | if random_init:
42 | retriever = model_class(cfg)
43 | else:
44 | retriever = utils.load_hf(model_class, model_id)
45 |
46 | if "bert-" in model_id:
47 | if tokenizer.bos_token_id is None:
48 | tokenizer.bos_token = "[CLS]"
49 | if tokenizer.eos_token_id is None:
50 | tokenizer.eos_token = "[SEP]"
51 |
52 | retriever.config.pooling = pooling
53 |
54 | return retriever, tokenizer
55 |
56 | def get_encoder(self):
57 | return self.encoder
58 |
59 | def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats={}, **kwargs):
60 |
61 | bsz = len(q_tokens)
62 | labels = torch.arange(0, bsz, dtype=torch.long, device=q_tokens.device)
63 |
64 | qemb = self.encoder(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query)
65 | kemb = self.encoder(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc)
66 |
67 | gather_fn = dist_utils.gather
68 |
69 | gather_kemb = gather_fn(kemb)
70 |
71 | labels = labels + dist_utils.get_rank() * len(kemb)
72 |
73 | scores = torch.einsum("id, jd->ij", qemb / self.opt.temperature, gather_kemb)
74 |
75 | loss = torch.nn.functional.cross_entropy(scores, labels, label_smoothing=self.label_smoothing)
76 |
77 | # log stats
78 | if len(stats_prefix) > 0:
79 | stats_prefix = stats_prefix + "/"
80 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz)
81 |
82 | predicted_idx = torch.argmax(scores, dim=-1)
83 | accuracy = 100 * (predicted_idx == labels).float().mean()
84 | stdq = torch.std(qemb, dim=0).mean().item()
85 | stdk = torch.std(kemb, dim=0).mean().item()
86 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz)
87 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz)
88 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz)
89 |
90 | return loss, iter_stats
91 |
--------------------------------------------------------------------------------
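
The forward pass above implements an in-batch contrastive (InfoNCE-style) objective: each query is scored against every document embedding in the gathered batch, and the matching document is the positive class for cross-entropy. A standalone single-process sketch of the same computation with toy embeddings (no distributed gather; normalization is optional in the real module):

    import torch
    import torch.nn.functional as F

    temperature = 0.05
    qemb = F.normalize(torch.randn(4, 768), dim=-1)  # toy query embeddings
    kemb = F.normalize(torch.randn(4, 768), dim=-1)  # toy document embeddings

    scores = torch.einsum('id,jd->ij', qemb / temperature, kemb)  # [4, 4] similarities
    labels = torch.arange(scores.size(0))                         # positives on the diagonal
    loss = F.cross_entropy(scores, labels)
    accuracy = (scores.argmax(dim=-1) == labels).float().mean()
    print(loss.item(), accuracy.item())
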
/source/model/llama2/examples/format_data/convert_vicuna.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | from datasets import load_dataset
5 |
6 | sys.path.append('../../')
7 |
8 | from chatllms.data.data_utils import extract_default_prompt_dataset
9 |
10 |
11 | def json_dump(obj, path):
12 | with open(path, 'w', encoding='utf-8') as f:
13 | json.dump(obj, f, indent=2, ensure_ascii=False)
14 |
15 |
16 | def json_load(in_file):
17 | with open(in_file, 'r') as f:
18 | json_data = json.load(f)
19 | return json_data
20 |
21 |
22 | def valid_keys(keys):
23 | for k in ['input', 'output']:
24 | if k not in keys:
25 | return False
26 | return True
27 |
28 |
29 | def remove_unused_columns(dataset):
30 | """Remove columns not named 'input' or 'output'."""
31 | dataset = dataset.remove_columns([
32 | col for col in dataset.column_names if col not in ['input', 'output']
33 | ])
34 | return dataset
35 |
36 |
37 | def convert_alpaca_vicuna(in_file: str, out_file: str = None):
38 | raw_dataset = load_dataset('json', data_files=in_file)['train']
39 | raw_dataset = raw_dataset.map(extract_default_prompt_dataset)
40 |
41 | collect_data = []
42 | for i, content in enumerate(raw_dataset):
43 | prompt = content['input']
44 | response = content['output']
45 |
46 | collect_data.append({
47 | 'id':
48 | f'alpaca_{i}',
49 | 'conversations': [
50 | {
51 | 'from': 'human',
52 | 'value': prompt
53 | },
54 | {
55 | 'from': 'gpt',
56 | 'value': response
57 | },
58 | ],
59 | })
60 | print(f'Original: {len(raw_dataset)}, Converted: {len(collect_data)}')
61 | json_dump(collect_data, out_file)
62 | return collect_data
63 |
64 |
65 | if __name__ == '__main__':
66 | in_file = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.json'
67 | out_file = '/home/robin/prompt_data/100PoisonMpts/train_vicuna.json'
68 | collect_data = convert_alpaca_vicuna(in_file, out_file)
69 |
70 | data_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json'
71 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_vicuna.json'
72 | convert_alpaca_vicuna(data_path, out_file=out_path)
73 |
74 | data_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json'
75 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_vicuna.json'
76 | convert_alpaca_vicuna(data_path, out_file=out_path)
77 |
78 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json'
79 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_vicnua.json'
80 | convert_alpaca_vicuna(data_path, out_file=out_path)
81 |
82 | data_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json'
83 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json'
84 | convert_alpaca_vicuna(data_path, out_file=out_path)
85 |
86 | data_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json'
87 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json'
88 | convert_alpaca_vicuna(data_path, out_file=out_path)
89 |
90 | data_path = '/home/robin/prompt_data/COIG/train_alpaca.json'
91 | out_path = '/home/robin/prompt_data/COIG/train_vicuna.json'
92 | convert_alpaca_vicuna(data_path, out_file=out_path)
93 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/gen_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Any, Dict, Optional
3 |
4 |
5 | @dataclass
6 | class GenerationArguments:
7 | """
8 | Arguments pertaining to specify the model generation parameters.
9 | """
10 | # generation parameters
11 | # Whether to use the KV cache during generation.
12 | use_cache: Optional[bool] = field(default=True)
13 | # Length arguments
14 | # Maximum number of newly generated tokens.
15 | max_new_tokens: Optional[int] = field(
16 | default=1024,
17 | metadata={
18 | 'help':
19 | 'Maximum number of new tokens to be generated in evaluation or prediction loops'
20 | 'if predict_with_generate is set.'
21 | })
22 | # Minimum number of newly generated tokens.
23 | min_new_tokens: Optional[int] = field(
24 | default=0,
25 | metadata={'help': 'Minimum number of new tokens to generate.'})
26 | # Maximum total number of tokens; overridden by max_new_tokens.
27 | max_length: Optional[int] = field(
28 | default=None,
29 | metadata={
30 | 'help':
31 | 'The maximum length the generated tokens can have. It can be overridden by max_new_tokens.'
32 | })
33 | # Generation strategy
34 | # Whether to sample; otherwise greedy decoding is used.
35 | do_sample: Optional[bool] = field(
36 | default=True,
37 | metadata={
38 | 'help':
39 | 'Whether or not to use sampling, use greedy decoding otherwise.'
40 | })
41 | # Number of beams for beam search.
42 | num_beams: Optional[int] = field(
43 | default=1,
44 | metadata={
45 | 'help': 'Number of beams for beam search. 1 means no beam search.'
46 | })
47 | # Number of beam groups for group beam search.
48 | num_beam_groups: Optional[int] = field(default=1)
49 | # Degeneration penalty (alpha) used by contrastive search.
50 | penalty_alpha: Optional[float] = field(default=None)
51 | # Hyperparameters for logit manipulation
52 | # Softmax temperature used to modulate the next-token distribution.
53 | temperature: Optional[float] = field(
54 | default=1.0,
55 | metadata={
56 | 'help': 'The value used to modulate the next token probabilities.'
57 | })
58 | # Top-k sampling: keep only the k highest-probability tokens.
59 | top_k: Optional[int] = field(
60 | default=50,
61 | metadata={
62 | 'help':
63 | 'The number of highest probability vocabulary tokens to keep for top-k filtering.'
64 | })
65 | # Nucleus (top-p) sampling: sample from the smallest set of tokens whose cumulative probability reaches top_p.
66 | top_p: Optional[float] = field(
67 | default=1.0,
68 | metadata={
69 | 'help':
70 | 'The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept.'
71 | })
72 | # Typical-p sampling parameter.
73 | typical_p: Optional[float] = field(default=1.0)
74 | # Diversity penalty (for group beam search).
75 | diversity_penalty: Optional[float] = field(default=0.0)
76 | # Repetition penalty factor.
77 | repetition_penalty: Optional[float] = field(
78 | default=1.0,
79 | metadata={
80 | 'help':
81 | 'The parameter for repetition penalty. 1.0 means no penalty.'
82 | })
83 | # Length penalty factor for beam-based generation.
84 | length_penalty: Optional[float] = field(
85 | default=1.0,
86 | metadata={
87 | 'help':
88 | 'Exponential penalty to the length that is used with beam-based generation.'
89 | })
90 | # Size of n-grams that must not repeat.
91 | # Usually left unset since sampling gives enough diversity; set it to 2 if the output repeats a lot.
92 | no_repeat_ngram_size: Optional[int] = field(default=0)
93 |
94 | def to_dict(self) -> Dict[str, Any]:
95 | args = asdict(self)
96 | if args.get('max_new_tokens', None):
97 | args.pop('max_length', None)
98 | return args
99 |
--------------------------------------------------------------------------------
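
GenerationArguments.to_dict() above drops max_length whenever max_new_tokens is set, so the two length controls never conflict when the dict is splatted into model.generate(). A tiny check (the import mirrors how multi_chat.py pulls in these configs):

    from chatllms.configs import GenerationArguments

    gen_args = GenerationArguments(max_new_tokens=256, max_length=2048)
    gen_kwargs = gen_args.to_dict()
    assert 'max_length' not in gen_kwargs and gen_kwargs['max_new_tokens'] == 256
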
/source/model/llama2/chatllms/utils/apply_lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Apply the LoRA weights on top of a base model.
3 |
4 | Usage:
5 | python3 apply_lora.py --base_model_path ~/model_weights/llama-7b --target_model_path ~/model_weights/baize-7b \
6 | --lora_path project-baize/baize-lora-7B
7 |
8 | Dependency:
9 | pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
10 | """
11 | import argparse
12 | from typing import Tuple
13 |
14 | import torch
15 | from peft import PeftModel
16 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
17 |
18 |
19 | def apply_lora(
20 | base_model_path: str,
21 | lora_model_path: str,
22 | target_model_path: str = None,
23 | cache_dir: str = None,
24 | use_auth_token: bool = True,
25 | trust_remote_code: bool = True,
26 | ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
27 | """Applies the LoRA adapter to a base model and saves the resulting target model (optional).
28 |
29 | Args:
30 | base_model_path (str): The path to the base model to which the LoRA adapter will be applied.
31 | lora_model_path (str): The path to the LoRA adapter.
32 | target_model_path (str): The path where the merged model will be saved (if provided).
33 | cache_dir (str): The path to the cache directory.
34 | use_auth_token (bool): Whether to use an authentication token when downloading the model.
35 | trust_remote_code (bool): Whether to trust remote code when downloading the model.
36 |
37 | Returns:
38 | Tuple[AutoModelForCausalLM, AutoTokenizer]: A tuple containing the target model and its tokenizer.
39 |
40 | """
41 | # Load the base model and tokenizer
42 | print(f'Loading the base model from {base_model_path}')
43 | # Set configuration kwargs for tokenizer.
44 | config_kwargs = {
45 | 'cache_dir': cache_dir,
46 | 'use_auth_token': use_auth_token,
47 | 'trust_remote_code': trust_remote_code,
48 | }
49 |
50 | base_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
51 | base_model_path,
52 | device_map='auto',
53 | torch_dtype=torch.float16,
54 | low_cpu_mem_usage=True,
55 | **config_kwargs,
56 | )
57 |
58 | # Load the tokenizer
59 | print(f'Loading the tokenizer from {base_model_path}')
60 | # use_fast=False is required for compatibility with Transformers' LlamaTokenizer.
61 | tokenizer = AutoTokenizer.from_pretrained(
62 | base_model_path,
63 | use_fast=False,
64 | **config_kwargs,
65 | )
66 |
67 | # Load the LoRA adapter
68 | print(f'Loading the LoRA adapter from {lora_model_path}')
69 | model: PreTrainedModel = PeftModel.from_pretrained(base_model,
70 | lora_model_path)
71 | print('Applying the LoRA to base model')
72 | model = model.merge_and_unload()
73 |
74 | if target_model_path is not None:
75 | print(f'Saving the target model to {target_model_path}')
76 | model.save_pretrained(target_model_path)
77 | tokenizer.save_pretrained(target_model_path)
78 |
79 | return model, tokenizer
80 |
81 |
82 | if __name__ == '__main__':
83 | parser = argparse.ArgumentParser()
84 | parser.add_argument('--base-model-path', type=str, required=True)
85 | parser.add_argument('--target-model-path', type=str, default=None)
86 | parser.add_argument('--lora-model-path', type=str, required=True)
87 | args = parser.parse_args()
88 |
89 | apply_lora(base_model_path=args.base_model_path,
90 | lora_model_path=args.lora_model_path,
91 | target_model_path=args.target_model_path)
92 |
--------------------------------------------------------------------------------
/retrieval_contriever/src/dist_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.distributed as dist
5 |
6 |
7 | class Gather(torch.autograd.Function):
8 | @staticmethod
9 | def forward(ctx, x: torch.tensor):
10 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
11 | dist.all_gather(output, x)
12 | return tuple(output)
13 |
14 | @staticmethod
15 | def backward(ctx, *grads):
16 | all_gradients = torch.stack(grads)
17 | dist.all_reduce(all_gradients)
18 | return all_gradients[dist.get_rank()]
19 |
20 |
21 | def gather(x: torch.Tensor):
22 | if not dist.is_initialized():
23 | return x
24 | x_gather = Gather.apply(x)
25 | x_gather = torch.cat(x_gather, dim=0)
26 | return x_gather
27 |
28 |
29 | @torch.no_grad()
30 | def gather_nograd(x: torch.Tensor):
31 | if not dist.is_initialized():
32 | return x
33 | x_gather = [torch.ones_like(x) for _ in range(dist.get_world_size())]
34 | dist.all_gather(x_gather, x, async_op=False)
35 |
36 | x_gather = torch.cat(x_gather, dim=0)
37 | return x_gather
38 |
39 |
40 | @torch.no_grad()
41 | def varsize_gather_nograd(x: torch.Tensor):
42 | """gather tensors of different sizes along the first dimension"""
43 | if not dist.is_initialized():
44 | return x
45 |
46 | # determine max size
47 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
48 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
49 | dist.all_gather(allsizes, size)
50 | max_size = max([size.cpu().max() for size in allsizes])
51 |
52 | padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
53 | padded[: x.shape[0]] = x
54 | output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
55 | dist.all_gather(output, padded)
56 |
57 | output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
58 | output = torch.cat(output, dim=0)
59 |
60 | return output
61 |
62 |
63 | @torch.no_grad()
64 | def get_varsize(x: torch.Tensor):
65 | """gather tensors of different sizes along the first dimension"""
66 | if not dist.is_initialized():
67 | return [x.shape[0]]
68 |
69 | # determine max size
70 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
71 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
72 | dist.all_gather(allsizes, size)
73 | allsizes = torch.cat(allsizes)
74 | return allsizes
75 |
76 |
77 | def get_rank():
78 | if not dist.is_available():
79 | return 0
80 | if not dist.is_initialized():
81 | return 0
82 | return dist.get_rank()
83 |
84 |
85 | def is_main():
86 | return get_rank() == 0
87 |
88 |
89 | def get_world_size():
90 | if not dist.is_initialized():
91 | return 1
92 | else:
93 | return dist.get_world_size()
94 |
95 |
96 | def barrier():
97 | if dist.is_initialized():
98 | dist.barrier()
99 |
100 |
101 | def average_main(x):
102 | if not dist.is_initialized():
103 | return x
104 | if dist.is_initialized() and dist.get_world_size() > 1:
105 | dist.reduce(x, 0, op=dist.ReduceOp.SUM)
106 | if is_main():
107 | x = x / dist.get_world_size()
108 | return x
109 |
110 |
111 | def sum_main(x):
112 | if not dist.is_initialized():
113 | return x
114 | if dist.is_initialized() and dist.get_world_size() > 1:
115 | dist.reduce(x, 0, op=dist.ReduceOp.SUM)
116 | return x
117 |
118 |
119 | def weighted_average(x, count):
120 | if not dist.is_initialized():
121 | if isinstance(x, torch.Tensor):
122 | x = x.item()
123 | return x, count
124 | t_loss = torch.tensor([x * count]).cuda()
125 | t_total = torch.tensor([count]).cuda()
126 | t_loss = sum_main(t_loss)
127 | t_total = sum_main(t_total)
128 | return (t_loss / t_total).item(), t_total.item()
129 |
--------------------------------------------------------------------------------
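A small sketch of how the differentiable `gather` above is typically used to widen the in-batch negative pool under DDP. The call is a no-op in a single process; the distributed setup itself is assumed to be handled elsewhere (e.g. by `src/slurm.py`).

    import torch
    from src import dist_utils

    def gather_embeddings(local_emb: torch.Tensor) -> torch.Tensor:
        # Without torch.distributed initialized this returns local_emb unchanged.
        # Under DDP it concatenates embeddings from every rank while keeping
        # gradients flowing back to the local shard.
        return dist_utils.gather(local_emb)

    if __name__ == '__main__':
        emb = torch.randn(8, 768, requires_grad=True)
        print(gather_embeddings(emb).shape)  # (8 * world_size, 768); (8, 768) single-process
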
/source/model/llama2/chatllms/model/sample_generate_callback.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from dataclasses import dataclass
3 | from typing import Any, Dict
4 |
5 | from transformers import PreTrainedTokenizer, TrainerCallback
6 |
7 |
8 | @dataclass
9 | class SampleGenerateCallback(TrainerCallback):
10 | """
11 | A callback that generates text samples from a pre-trained language model during training.
12 |
13 | Args:
14 | tokenizer (PreTrainedTokenizer): The tokenizer used to preprocess inputs.
15 |         generation_config: The generation settings passed to model.generate(); logger: the logger used to record the generated samples.
16 | """
17 | def __init__(self, tokenizer: PreTrainedTokenizer,
18 | generation_config: argparse.Namespace, logger: None):
19 | self.tokenizer = tokenizer
20 | self.generation_config = generation_config
21 | self.logger = logger
22 |
23 | # Define input prompts to generate text from
24 | self.sample_inputs = [
25 | '用一句话描述地球为什么是独一无二的。',
26 | '中国是否应该推出刺激政策救楼市?',
27 | '如何更好地融入新工作圈子',
28 | '帮我把这段文字转换成鲁迅作品里的语气:昨天上午,算几个数学问题时越算越难受,有想要撕掉草稿纸的冲动思维也变得缓慢,见字忘意,感觉大脑里是一团浆糊,阻力很大。'
29 | '我怀疑自己抑郁又犯了,站起身离开了书桌。走出大门,开始跑步,运动,希望能借此缓解。我不想再吃药,我担心不吃药是否能恢复。稍微运动后,大吃了一顿,路上不停的对自己说,我可以.',
30 | '回来后,感觉似乎确实好一些。',
31 | '给我写一篇大模型的新闻稿',
32 | '你觉得人类哪些工作岗位会被AI替代?',
33 | '请帮我写一封中式婚礼请帖,用于邀请亲朋好友参加我的婚礼!',
34 | '帮我写一篇八百字以上的作文,主题是:当代青年面对时代的挑战如何肩负起民族复兴的伟大任务',
35 | '请仿照李荣浩的风格写一首表现爱情的歌曲,以“辣椒酱”为题。',
36 | '秦王朝时期十大将军是?其主要功绩是什么?',
37 | '帮我写一段广告,关于房产销售的,我们的房子首付低,赠送面积大,还免两年物业费!',
38 | '请帮我设计一个时长为3天的北京旅游行程,行程的内容不要太紧凑,使用地铁作为交通工具,并前往前门、天安门、天坛公园、鸟巢游览,同时预留一天的时间游玩环球影城。',
39 | '一个笼子里面有若干只鸡和兔子,总共有50只脚和18个头,求鸡和兔子各有多少只?',
40 | '生成一篇短篇小说,故事情节为一个年轻人在旅途中遇到了一位神秘的老人,老人告诉他一个令人意想不到的秘密,最终年轻人的生活因此发生了翻天覆地的变化。',
41 | '导师想要我论文的一作,我应该怎么办?',
42 | '我现在很无聊,可以讲点有趣的事情吗?',
43 | '一项工程,甲、乙两队合作20天完成,乙丙两队合作60天完成,丙丁两队合作30完成,甲丁合作多少天完成?',
44 | '如果一位孕妇走上了公交车,但是车上没有空位了。请模拟一位热心乘客给孕妇让座的对话。',
45 | '桃花潭水深千尺,不及汪伦送我情。体现的是怎样的心情?',
46 | '编写一个简单的自动化脚本,用于批量操作文件或目录。脚本功能可以自由选择,如复制、压缩、重命名、删除等。脚本语言可使用Python、Shell、Perl等,代码长度不少于100行。',
47 | '音乐可以洗涤人的灵魂吗?',
48 | ]
49 |
50 | def on_evaluate(self, args: Any, state: Dict[str, Any], control: Any,
51 | **kwargs: Any) -> None:
52 | """
53 | Generates text samples from the language model during evaluation.
54 |
55 | Args:
56 | args (Any): Trainer arguments, not used in this method.
57 | state (Dict[str, Any]): Trainer state dictionary, not used in this method.
58 | control (Any): Trainer control object, not used in this method.
59 | kwargs (Dict[str, Any]): Keyword arguments passed to the method, including the pre-trained
60 | language model (under the key 'model') and any additional parameters needed for generation.
61 |
62 | Returns:
63 | None
64 | """
65 | logger = self.logger
66 | logger.info('Generating sample text during evaluation...')
67 |
68 | # Check if the pre-trained language model is available
69 | if 'model' in kwargs:
70 | model = kwargs['model']
71 |
72 | # Generate text for each input prompt
73 | for instruction in self.sample_inputs:
74 | # Preprocess input prompt and convert to tensor
75 | inputs = f'{instruction}\n\n### Response: '
76 | inputs = self.tokenizer(inputs, return_tensors='pt')
77 | inputs = inputs.to(model.device)
78 |
79 | # Generate text from input prompt
80 | generation_output = model.generate(
81 | **inputs,
82 | generation_config=self.generation_config,
83 | )
84 |
85 | # Decode generated text and log it
86 | generated_text = self.tokenizer.decode(generation_output[0])
87 | logger.info(f'Input prompt: {instruction}')
88 | logger.info(f'Generated text: {generated_text}')
89 |
90 | else:
91 | logger.info(
92 | 'Pre-trained language model not found in kwargs, skipping.')
93 |
--------------------------------------------------------------------------------
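A sketch of attaching the callback above to a `transformers.Trainer`, mirroring the wiring in `train_qlora.py` further below; the trainer and logger are assumed to be built elsewhere.

    import logging
    from transformers import GenerationConfig, Trainer
    from chatllms.model import SampleGenerateCallback

    def add_sample_generation(trainer: Trainer, logger: logging.Logger) -> None:
        # Log a handful of generations on every evaluation pass.
        trainer.add_callback(
            SampleGenerateCallback(
                tokenizer=trainer.tokenizer,
                generation_config=GenerationConfig(max_new_tokens=256, do_sample=True),
                logger=logger,
            ))
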
/source/model/llama2/chatllms/model/compute_metrics.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Dict, List, Tuple, Union
3 |
4 | import jieba
5 | import numpy as np
6 | from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
7 | from rouge_chinese import Rouge
8 | from transformers import PreTrainedTokenizer
9 |
10 |
11 | @dataclass
12 | class ComputeMetrics:
13 | """
14 | Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
15 | Borrowed from: https://github.com/THUDM/ChatGLM-6B/blob/0c2806fea82683349194e21996dd6b3acc3c265b/ptuning/main.py#L307
16 |
17 | """
18 | def __init__(self, tokenizer: PreTrainedTokenizer) -> None:
19 | """
20 | Initialize the ComputeMetrics class with a pre-trained tokenizer object.
21 |
22 | Args:
23 | tokenizer (PreTrainedTokenizer): A pre-trained tokenizer object to be used for decoding tokenized sequences.
24 | """
25 | self.tokenizer = tokenizer
26 |
27 | def __call__(
28 | self, eval_preds: List[Union[np.ndarray, Tuple[np.ndarray]]]
29 | ) -> Dict[str, float]:
30 | """
31 | Computes evaluation metrics for model predictions.
32 |
33 | Args:
34 |             eval_preds: A tuple of (predictions, labels) arrays produced by the Trainer during evaluation.
35 |
36 | Returns:
37 | Dict[str, float]: A dictionary containing the average of each computed metric over all prediction-label pairs.
38 | """
39 |
40 | # Extract predictions and labels from input
41 | preds, labels = eval_preds
42 | if isinstance(preds, tuple):
43 | preds = preds[0]
44 |
45 |         # Replace IGNORE_INDEX (-100) in preds/labels with pad_token_id, since -100 cannot be decoded.
46 |         preds = np.where(preds != -100, preds,
47 |                          self.tokenizer.pad_token_id)
48 |         labels = np.where(labels != -100, labels,
49 |                           self.tokenizer.pad_token_id)
50 |
51 | score_dict = {
52 |             'rouge-1': [],  # the numeral one
53 |             'rouge-2': [],
54 |             'rouge-l': [],  # the letter 'l'
55 | 'bleu-4': []
56 | }
57 |
58 | # Calculate metrics for each prediction-label pair
59 | for pred, label in zip(preds, labels):
60 | pred = pred[(pred == self.tokenizer.bos_token_id
61 | ).nonzero()[0][0]:] # remove the query
62 | hypothesis = list(
63 | jieba.cut(self.tokenizer.decode(pred,
64 | skip_special_tokens=True)))
65 | reference = list(
66 | jieba.cut(
67 | self.tokenizer.decode(label, skip_special_tokens=True)))
68 |
69 | # If there are no words in the hypothesis, set all scores to 0
70 | if len(' '.join(hypothesis).split()) == 0:
71 | result = {
72 | 'rouge-1': {
73 | 'f': 0.0
74 | },
75 | 'rouge-2': {
76 | 'f': 0.0
77 | },
78 | 'rouge-l': {
79 | 'f': 0.0
80 | }
81 | }
82 | else:
83 | rouge = Rouge()
84 | scores = rouge.get_scores(' '.join(hypothesis),
85 | ' '.join(reference))
86 | result = scores[0]
87 |
88 | # Append scores to score_dict
89 | for k, v in result.items():
90 | score_dict[k].append(round(v['f'] * 100, 4))
91 |
92 | # Calculate BLEU-4 score and append it to score_dict
93 | bleu_score = sentence_bleu(
94 | [list(label)],
95 | list(pred),
96 | smoothing_function=SmoothingFunction().method3)
97 | score_dict['bleu-4'].append(round(bleu_score * 100, 4))
98 |
99 | # Calculate average of each metric over all prediction-label pairs and return as a dictionary
100 | return {k: float(np.mean(v)) for k, v in score_dict.items()}
101 |
--------------------------------------------------------------------------------
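A sketch of plugging `ComputeMetrics` into a `Seq2SeqTrainer`; the model, tokenizer and datasets are assumed to be prepared elsewhere, and `predict_with_generate=True` is needed so the metrics receive generated token ids rather than logits.

    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
    from chatllms.model.compute_metrics import ComputeMetrics

    def build_eval_trainer(model, tokenizer, train_dataset, eval_dataset,
                           output_dir='./outputs'):
        args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            predict_with_generate=True,  # decode generated ids for ROUGE/BLEU
            per_device_eval_batch_size=4,
        )
        return Seq2SeqTrainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=ComputeMetrics(tokenizer),  # ROUGE-1/2/L and BLEU-4
        )
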
/retrieval_contriever/src/slurm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from logging import getLogger
8 | import os
9 | import sys
10 | import torch
11 | import socket
12 | import signal
13 | import subprocess
14 |
15 |
16 | logger = getLogger()
17 |
18 | def sig_handler(signum, frame):
19 | logger.warning("Signal handler called with signal " + str(signum))
20 | prod_id = int(os.environ['SLURM_PROCID'])
21 | logger.warning("Host: %s - Global rank: %i" % (socket.gethostname(), prod_id))
22 | if prod_id == 0:
23 | logger.warning("Requeuing job " + os.environ['SLURM_JOB_ID'])
24 | os.system('scontrol requeue ' + os.environ['SLURM_JOB_ID'])
25 | else:
26 | logger.warning("Not the main process, no need to requeue.")
27 | sys.exit(-1)
28 |
29 |
30 | def term_handler(signum, frame):
31 | logger.warning("Signal handler called with signal " + str(signum))
32 | logger.warning("Bypassing SIGTERM.")
33 |
34 |
35 | def init_signal_handler():
36 | """
37 | Handle signals sent by SLURM for time limit / pre-emption.
38 | """
39 | signal.signal(signal.SIGUSR1, sig_handler)
40 | signal.signal(signal.SIGTERM, term_handler)
41 |
42 |
43 | def init_distributed_mode(params):
44 | """
45 | Handle single and multi-GPU / multi-node / SLURM jobs.
46 | Initialize the following variables:
47 | - local_rank
48 | - global_rank
49 | - world_size
50 | """
51 |     is_slurm_job = 'SLURM_JOB_ID' in os.environ and 'WORLD_SIZE' not in os.environ
52 | has_local_rank = hasattr(params, 'local_rank')
53 |
54 | # SLURM job without torch.distributed.launch
55 | if is_slurm_job and has_local_rank:
56 |
57 | assert params.local_rank == -1 # on the cluster, this is handled by SLURM
58 |
59 | # local rank on the current node / global rank
60 | params.local_rank = int(os.environ['SLURM_LOCALID'])
61 | params.global_rank = int(os.environ['SLURM_PROCID'])
62 | params.world_size = int(os.environ['SLURM_NTASKS'])
63 |
64 | # define master address and master port
65 | hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', os.environ['SLURM_JOB_NODELIST']])
66 | params.main_addr = hostnames.split()[0].decode('utf-8')
67 | assert 10001 <= params.main_port <= 20000 or params.world_size == 1
68 |
69 | # set environment variables for 'env://'
70 | os.environ['MASTER_ADDR'] = params.main_addr
71 | os.environ['MASTER_PORT'] = str(params.main_port)
72 | os.environ['WORLD_SIZE'] = str(params.world_size)
73 | os.environ['RANK'] = str(params.global_rank)
74 | is_distributed = True
75 |
76 |
77 | # multi-GPU job (local or multi-node) - jobs started with torch.distributed.launch
78 | elif has_local_rank and params.local_rank != -1:
79 |
80 | assert params.main_port == -1
81 |
82 | # read environment variables
83 | params.global_rank = int(os.environ['RANK'])
84 | params.world_size = int(os.environ['WORLD_SIZE'])
85 |
86 | is_distributed = True
87 |
88 | # local job (single GPU)
89 | else:
90 | params.local_rank = 0
91 | params.global_rank = 0
92 | params.world_size = 1
93 | is_distributed = False
94 |
95 | # set GPU device
96 | torch.cuda.set_device(params.local_rank)
97 |
98 | # initialize multi-GPU
99 | if is_distributed:
100 |
101 | # http://pytorch.apachecn.org/en/0.3.0/distributed.html#environment-variable-initialization
102 | # 'env://' will read these environment variables:
103 | # MASTER_PORT - required; has to be a free port on machine with rank 0
104 | # MASTER_ADDR - required (except for rank 0); address of rank 0 node
105 | # WORLD_SIZE - required; can be set either here, or in a call to init function
106 | # RANK - required; can be set either here, or in a call to init function
107 |
108 | #print("Initializing PyTorch distributed ...")
109 | torch.distributed.init_process_group(
110 | init_method='env://',
111 | backend='nccl',
112 | #world_size=params.world_size,
113 | #rank=params.global_rank,
114 | )
--------------------------------------------------------------------------------
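A sketch of the argparse fields `init_distributed_mode` expects; the defaults shown correspond to the single-GPU path, and a CUDA device is assumed to be available.

    import argparse
    from src import slurm

    def parse_args() -> argparse.Namespace:
        parser = argparse.ArgumentParser()
        # -1 means "not launched by torch.distributed.launch"; SLURM fills ranks itself.
        parser.add_argument('--local_rank', type=int, default=-1)
        # Only read on the SLURM path; must fall in [10001, 20000] for multi-task jobs.
        parser.add_argument('--main_port', type=int, default=-1)
        return parser.parse_args()

    if __name__ == '__main__':
        params = parse_args()
        slurm.init_distributed_mode(params)  # sets local_rank / global_rank / world_size
        print(params.global_rank, params.world_size)
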
/source/model/llama2/train_qlora.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 |
5 | import torch
6 | import transformers
7 | from transformers import GenerationConfig, Trainer, set_seed
8 |
9 | from chatllms.configs import (DataArguments, GenerationArguments,
10 | LoraArguments, ModelArguments, QuantArguments,
11 | TrainingArguments)
12 | from chatllms.data import make_supervised_data_module
13 | from chatllms.model import (MMLUEvalCallback, SampleGenerateCallback,
14 | SavePeftModelCallback, load_model_tokenizer)
15 | from chatllms.train.training import train_and_evaluate
16 | from chatllms.utils.logger_utils import get_root_logger
17 | from chatllms.utils.model_utils import (check_training_finished,
18 | print_trainable_parameters,
19 | verify_dtypes)
20 |
21 | torch.backends.cuda.matmul.allow_tf32 = True
22 |
23 |
24 | def main():
25 | parser = transformers.HfArgumentParser(
26 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments,
27 | QuantArguments, GenerationArguments))
28 | (model_args, data_args, training_args, lora_args, quant_args,
29 | generation_args) = parser.parse_args_into_dataclasses()
30 | # Check arguments (do not check finetuning_args since it may be loaded from checkpoints)
31 | data_args.init_for_training()
32 | training_args.generation_config = GenerationConfig(**vars(generation_args))
33 |
34 | args = argparse.Namespace(**vars(model_args), **vars(data_args),
35 | **vars(training_args), **vars(lora_args),
36 | **vars(quant_args))
37 | # init the logger before other steps
38 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
39 | if not os.path.exists(args.output_dir):
40 | os.makedirs(args.output_dir)
41 | log_file = os.path.join(args.output_dir, f'{timestamp}.log')
42 | logger = get_root_logger(log_file=log_file, log_level='INFO')
43 |
44 | # Log on each process the small summary:
45 | logger.info(
46 | f'Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}'
47 | +
48 |         f', distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}'
49 | )
50 | logger.info('Training/evaluation parameters %s', args)
51 | # Check if training was already completed.
52 | checkpoint_dir, completed_training = check_training_finished(args, logger)
53 | args.resume_checkpoint = checkpoint_dir
54 |
55 | # load model and tokenizer
56 | model, tokenizer = load_model_tokenizer(
57 | args=args,
58 | checkpoint_dir=checkpoint_dir,
59 | is_trainable=args.do_train,
60 | logger=logger,
61 | )
62 | logger.info('Loaded model...')
63 |
64 | logger.info('Printing trainable parameters...')
65 | print_trainable_parameters(args, model)
66 |
67 | set_seed(args.seed)
68 |
69 | # Verify dtypes
70 | logger.info('Verifying dtypes...')
71 | verify_dtypes(model)
72 |
73 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args)
74 | trainer = Trainer(model=model,
75 | tokenizer=tokenizer,
76 | args=training_args,
77 | **data_module)
78 | # Add callback to save adapter model.
79 | if not args.full_finetune:
80 | trainer.add_callback(SavePeftModelCallback)
81 |
82 | # Add callback to generate samples.
83 | if args.sample_generate:
84 | trainer.add_callback(
85 | SampleGenerateCallback(
86 | tokenizer=tokenizer,
87 | generation_config=GenerationConfig(**vars(generation_args)),
88 | logger=logger,
89 | ))
90 |
91 | if args.do_mmlu_eval:
92 | eval_callback = MMLUEvalCallback(
93 | trainer=trainer,
94 | tokenizer=tokenizer,
95 | data_dir='./data',
96 | args=args,
97 | )
98 | trainer.add_callback(eval_callback)
99 |
100 | assert args.do_train or args.do_eval or args.do_predict
101 | if args.do_train or args.do_eval:
102 | train_and_evaluate(trainer, args, logger)
103 |
104 |
105 | if __name__ == '__main__':
106 | main()
107 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/model/save_peft_model_callback.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any, Dict
3 |
4 | from transformers import (PreTrainedModel, TrainerCallback, TrainerControl,
5 |                           TrainerState, TrainingArguments)
6 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
7 |
8 |
9 | class SavePeftModelCallback(TrainerCallback):
10 | """
11 | Callback to save PEFT model checkpoints during training.
12 |
13 | Saves both the full model and the adapter model to separate directories
14 | within the checkpoint directory.
15 | """
16 |     def save_model(self, args: TrainingArguments, state: TrainerState,
17 |                    kwargs: Dict[str, Any]) -> None:
18 |         """
19 |         Saves the PEFT (adapter) model checkpoint.
20 | 
21 |         Args:
22 |             args (TrainingArguments): The training arguments passed to the Trainer.
23 |             state (TrainerState): The current state of training.
24 |             kwargs (Dict[str, Any]): Additional keyword arguments, including the
25 |                 model being trained (under the key 'model').
26 | 
27 |         Note: any full `pytorch_model.bin` in the checkpoint folder is removed so only adapter weights remain.
28 |         """
29 | print('+' * 20, 'Saving PEFT Model Checkpoint CallBack', '+' * 20)
30 |
31 | # Get the checkpoint directory for saving models.
32 | if state.best_model_checkpoint is not None:
33 | # If best model checkpoint exists, use its directory as the checkpoint folder
34 | checkpoint_dir = os.path.join(state.best_model_checkpoint,
35 | 'adapter_model')
36 | else:
37 | # Otherwise, create a new checkpoint folder using the output directory and current global step
38 | checkpoint_dir = os.path.join(
39 | args.output_dir,
40 | f'{PREFIX_CHECKPOINT_DIR}-{state.global_step}')
41 |
42 | # Create path for the PEFT model
43 | peft_model_path = os.path.join(checkpoint_dir, 'adapter_model')
44 | model: PreTrainedModel = kwargs['model']
45 | model.save_pretrained(peft_model_path)
46 |
47 | # Create path for the PyTorch model binary file and remove it if it already exists
48 | pytorch_model_path = os.path.join(checkpoint_dir, 'pytorch_model.bin')
49 | if os.path.exists(pytorch_model_path):
50 | os.remove(pytorch_model_path)
51 |
52 |     def on_save(self, args: TrainingArguments, state: TrainerState,
53 |                 control: TrainerControl,
54 |                 **kwargs: Dict[str, Any]) -> TrainerControl:
55 |         """
56 |         Callback method that calls save_model() and returns the `control` argument.
57 | 
58 |         Args:
59 |             args (TrainingArguments): The training arguments passed to the Trainer.
60 |             state (TrainerState): The current state of training.
61 |             control (TrainerControl): \
62 |                 The current state of the TrainerCallback's control flow.
63 |             kwargs (Dict[str, Any]): A dictionary of additional keyword arguments.
64 | 
65 |         Returns:
66 |             TrainerControl: The current state of the TrainerCallback's control flow.
67 | 
68 |         Note:
69 |             The adapter checkpoint is written on every save event.
70 |         """
71 | self.save_model(args, state, kwargs)
72 | return control
73 |
74 |     def on_train_end(self, args: TrainingArguments, state: TrainerState,
75 |                      control: TrainerControl, **kwargs: Dict[str,
76 |                                                               Any]) -> None:
77 |         """
78 |         Callback method that marks training as finished by creating a 'completed'
79 |         file in the output directory.
80 | 
81 |         Args:
82 |             args (TrainingArguments): The training arguments passed to the Trainer.
83 |             state (TrainerState): The current state of training.
84 |             control (TrainerControl): \
85 |                 The current state of the TrainerCallback's control flow.
86 |             kwargs (Dict[str, Any]): A dictionary of additional keyword arguments.
87 | 
88 |         The marker file lets later runs detect that this job already finished.
89 |         """
90 |
91 | # Define a helper function to create a 'completed' file in the output directory
92 | def touch(fname, times=None):
93 | with open(fname, 'a'):
94 | os.utime(fname, times)
95 |
96 | # Create the 'completed' file in the output directory
97 | touch(os.path.join(args.output_dir, 'completed'))
98 |
--------------------------------------------------------------------------------
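A sketch of registering the callback above for adapter-only runs, mirroring the `full_finetune` check in `train_qlora.py`.

    from transformers import Trainer
    from chatllms.model import SavePeftModelCallback

    def maybe_add_peft_saving(trainer: Trainer, full_finetune: bool) -> None:
        # Only adapter (PEFT) runs need the extra callback; full fine-tuning
        # already saves complete checkpoints through the Trainer itself.
        if not full_finetune:
            trainer.add_callback(SavePeftModelCallback)
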
/source/model/llama2/server/single_chat.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | import sys
4 | from threading import Thread
5 | from typing import List
6 |
7 | import torch
8 | import transformers
9 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel,
10 | PreTrainedTokenizer, TextIteratorStreamer)
11 |
12 | sys.path.append('../')
13 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
14 | from chatllms.utils.model_utils import get_logits_processor
15 |
16 |
17 | def generate_response(query: str, tokenizer: PreTrainedTokenizer,
18 | model: PreTrainedModel,
19 |                       generation_args: dict) -> str:
20 |     """
21 |     Generates a response to the given query with the loaded causal language model and prints it to the console.
22 | 
23 |     Args:
24 |         query (str): The input query for which a response is to be generated.
25 |         tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens.
26 |         model (PreTrainedModel): The pre-trained causal language model used to generate the response.
27 |         generation_args (dict): A dictionary containing the arguments to be passed to the generate() method of the model.
28 | 
29 |     Returns:
30 |         str: The generated response text.
31 |     """
32 |
33 | # Convert the query and history into input IDs
34 | inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False)
35 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
36 |
37 | # Create a TextIteratorStreamer object to stream the response from the model
38 | streamer = TextIteratorStreamer(tokenizer,
39 | timeout=60.0,
40 | skip_prompt=True,
41 | skip_special_tokens=True)
42 |
43 | # Set the arguments for the model's generate() method
44 | gen_kwargs = dict(
45 | **inputs,
46 | streamer=streamer,
47 | logits_processor=get_logits_processor(),
48 | **generation_args.to_dict(),
49 | )
50 |
51 | # Start a separate thread to generate the response asynchronously
52 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
53 | thread.start()
54 |
55 | # Print the model name and the response as it is generated
56 | print('Assistant: ', end='', flush=True)
57 | response = ''
58 | for new_text in streamer:
59 | print(new_text, end='', flush=True)
60 | response += new_text
61 | # Update the history with the current query and response and return it
62 | return response
63 |
64 |
65 | def main():
66 | """
67 |     Single-turn chat: the model keeps no memory of previous conversation turns.
68 |     Run the conversational agent loop with console input/output.
69 | 
70 |     Command-line arguments (parsed via HfArgumentParser):
71 |         ModelInferenceArguments: Arguments for loading the model.
72 |         GenerationArguments: Arguments for model.generate().
73 | 
74 | Returns:
75 | None
76 | """
77 |
78 | # Parse command-line arguments
79 | parser = transformers.HfArgumentParser(
80 | (ModelInferenceArguments, GenerationArguments))
81 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
82 |
83 | # Load the pretrained language model.
84 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
85 |
86 | model = AutoModelForCausalLM.from_pretrained(
87 | model_server_args.model_name_or_path,
88 | trust_remote_code=True,
89 | low_cpu_mem_usage=True,
90 | torch_dtype=torch.float16,
91 |         device_map='auto').eval()  # device_map='auto' already places the weights; avoid .to() on a dispatched model
92 |
93 | tokenizer = AutoTokenizer.from_pretrained(
94 | model_server_args.model_name_or_path,
95 | trust_remote_code=True,
96 | use_fast=False,
97 | )
98 |
99 | os_name = platform.system()
100 | clear_command = 'cls' if os_name == 'Windows' else 'clear'
101 | # Set the arguments for the model's generate() method
102 | print('欢迎使用 CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序')
103 | input_pattern = '{}'
104 | while True:
105 | query = input('\nUser: ')
106 | if query.strip() == 'stop':
107 | break
108 |
109 | if query.strip() == 'clear':
110 | os.system(clear_command)
111 | print('History has been removed.')
112 | print('欢迎使用CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序')
113 | continue
114 |
115 | query = input_pattern.format(query)
116 | # Perform prediction and printing
117 | generate_response(query, tokenizer, model, generation_args)
118 |
119 |
120 | if __name__ == '__main__':
121 | main()
122 |
--------------------------------------------------------------------------------
/source/model/llama2/server/gradio_base_webserver.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import gradio as gr
4 | import torch
5 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
6 |
7 | from chatllms.utils.apply_lora import apply_lora
8 |
9 |
10 | def args_parser():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--model_name_or_path',
13 | default=None,
14 | type=str,
15 | required=True,
16 | help='Path to pre-trained model')
17 | parser.add_argument('--lora_model_name_or_path',
18 | default=None,
19 | type=str,
20 |                         help='Path to the LoRA adapter weights')
21 | parser.add_argument('--no_cuda',
22 | action='store_true',
23 | help='Avoid using CUDA when available')
24 | parser.add_argument('--load_8bit',
25 | action='store_true',
26 |                         help='Whether to load the model in 8-bit precision')
27 | args = parser.parse_args()
28 |
29 | args.device = torch.device(
30 | 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
31 | return args
32 |
33 |
34 | def main(args):
35 | if args.lora_model_name_or_path is not None:
36 |         # apply_lora() does not take a load_8bit argument; it merges the adapter in fp16.
37 |         model, tokenizer = apply_lora(args.model_name_or_path,
38 |                                       args.lora_model_name_or_path)
39 | else:
40 | tokenizer = AutoTokenizer.from_pretrained(
41 | pretrained_model_name_or_path=args.model_name_or_path,
42 | trust_remote_code=True)
43 | model = AutoModelForCausalLM.from_pretrained(
44 | pretrained_model_name_or_path=args.model_name_or_path,
45 | load_in_8bit=args.load_8bit,
46 | torch_dtype=torch.float16,
47 | device_map='auto',
48 | trust_remote_code=True)
49 |
50 | def evaluate(
51 | input=None,
52 | temperature=0.8,
53 | top_p=0.75,
54 | top_k=40,
55 | max_new_tokens=128,
56 | **kwargs,
57 | ):
58 | inputs = tokenizer(input, return_tensors='pt')
59 | inputs = inputs.to(args.device)
60 | generation_config = GenerationConfig(
61 | temperature=temperature,
62 | top_p=top_p,
63 | top_k=top_k,
64 | do_sample=True,
65 | no_repeat_ngram_size=6,
66 | repetition_penalty=1.8,
67 | **kwargs,
68 | )
69 | # Without streaming
70 | with torch.no_grad():
71 | generation_output = model.generate(
72 | **inputs,
73 | generation_config=generation_config,
74 | return_dict_in_generate=True,
75 | output_scores=True,
76 | max_new_tokens=max_new_tokens,
77 | )
78 | s = generation_output.sequences[0]
79 | output = tokenizer.decode(s, skip_special_tokens=True)
80 | yield output
81 |
82 |     description = 'Baichuan7B is a 7B-parameter LLaMA-style model finetuned to follow instructions.'
83 | server = gr.Interface(
84 | fn=evaluate,
85 | inputs=[
86 | gr.components.Textbox(lines=2, label='Input', placeholder='none'),
87 | gr.components.Slider(minimum=0,
88 | maximum=1,
89 | value=0.1,
90 | label='Temperature'),
91 | gr.components.Slider(minimum=0,
92 | maximum=1,
93 | value=0.75,
94 | label='Top p'),
95 | gr.components.Slider(minimum=0,
96 | maximum=100,
97 | step=1,
98 | value=40,
99 | label='Top k'),
100 | gr.components.Slider(minimum=1,
101 | maximum=2000,
102 | step=1,
103 | value=128,
104 | label='Max tokens'),
105 | ],
106 |         outputs=[gr.components.Textbox(
107 | lines=5,
108 | label='Output',
109 | )],
110 | title='Baichuan7B',
111 | description=description,
112 | )
113 |
114 | server.queue().launch(server_name='0.0.0.0', share=False)
115 |
116 |
117 | if __name__ == '__main__':
118 | args = args_parser()
119 | main(args)
120 |
--------------------------------------------------------------------------------
/source/model/llama2/data/dataset_info.yaml:
--------------------------------------------------------------------------------
1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments.
2 | alpaca:
3 | hf_hub_url: tatsu-lab/alpaca
4 | local_path: tatsu-lab/alpaca/alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | alpaca-clean:
9 | hf_hub_url: yahma/alpaca-cleaned
10 | local_path: ''
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | coig:
15 | hf_hub_url: BAAI/COIG
16 |   local_path: /home/robin/prompt_data/COIG/train_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | dolly-15k:
21 | hf_hub_url: databricks/databricks-dolly-15k
22 | local_path: databricks/databricks-dolly-15k
23 | dataset_format: dolly
24 | multi_turn: False
25 |
26 | cvalues_comparison_train:
27 | hf_hub_url: ''
28 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | cvalues_comparison_test:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | guanaco:
39 | hf_hub_url: JosephusCheung/GuanacoDataset
40 | local_path: ''
41 | dataset_format: guanaco
42 | multi_turn: False
43 |
44 | hh-rlhf:
45 | hf_hub_url: Anthropic/hh-rlhf
46 | local_path: ''
47 | dataset_format: hh-rlhf
48 | multi_turn: False
49 |
50 | huatuogpt:
51 | hf_hub_url: FreedomIntelligence/HuatuoGPT-sft-data-v1
52 | local_path: /home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.jsonl
53 | dataset_format: alpaca
54 | multi_turn: False
55 |
56 | openassistant-guanaco:
57 | hf_hub_url: timdettmers/openassistant-guanaco
58 | local_path: /home/robin/prompt_data/timdettmers/openassistant-guanaco
59 | dataset_format: alpaca
60 | multi_turn: False
61 |
62 | olcc:
63 | hf_hub_url: ''
64 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json
65 | dataset_format: alpaca
66 | multi_turn: False
67 |
68 | 100PoisonMpts:
69 | hf_hub_url: 'damo/100PoisonMpts'
70 | local_path: /home/robin/prompt_data/100PoisonMpts/train.jsonl
71 | dataset_format: 100PoisonMpts
72 | multi_turn: False
73 |
74 | safety_prompt_part1:
75 | hf_hub_url: ''
76 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
77 | dataset_format: alpaca
78 | multi_turn: False
79 |
80 | safety_prompt_part2:
81 | hf_hub_url: ''
82 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
83 | dataset_format: alpaca
84 | multi_turn: False
85 |
86 | # Belle Group
87 | belle_0.5m:
88 | hf_hub_url: BelleGroup/train_0.5M_CN
89 | local_path: ''
90 | dataset_format: alpaca
91 | multi_turn: False
92 |
93 | belle_1m:
94 | hf_hub_url: BelleGroup/train_1M_CN
95 | local_path: ''
96 | dataset_format: alpaca
97 | multi_turn: False
98 |
99 | belle_2m:
100 | hf_hub_url: BelleGroup/train_2M_CN
101 | local_path: ''
102 | dataset_format: alpaca
103 | multi_turn: False
104 |
105 | belle_dialog:
106 | hf_hub_url: BelleGroup/generated_chat_0.4M
107 | local_path: ''
108 | dataset_format: belle_dialog
109 | multi_turn: False
110 |
111 | belle_math:
112 | hf_hub_url: BelleGroup/school_math_0.25M
113 | local_path: ''
114 | dataset_format: alpaca
115 | multi_turn: False
116 |
117 | belle_multiturn:
118 | hf_hub_url: BelleGroup/multi_turn_0.5M
119 | local_path: ''
120 | dataset_format: belle_multiturn
121 | multi_turn: True
122 | columns:
123 | prompt: instruction
124 | query: ''
125 | response: output
126 | history: history
127 |
128 | # firefly
129 | firefly:
130 | hf_hub_url: YeungNLP/firefly-train-1.1M
131 | local_path: ''
132 | dataset_format: alpaca
133 | multi_turn: False
134 | columns:
135 | prompt: input
136 | query: ''
137 | response: target
138 | history: ''
139 |
140 | # CodeAlpaca
141 | codealpaca:
142 | hf_hub_url: sahil2801/CodeAlpaca-20k
143 | local_path: ''
144 | dataset_format: codealpaca
145 | multi_turn: False
146 |
147 | # alpacacot
148 | alpaca_cot:
149 | hf_hub_url: QingyiSi/Alpaca-CoT
150 | local_path: ''
151 | multi_turn: False
152 |
153 | webqa:
154 | hf_hub_url: suolyer/webqa
155 | local_path: ''
156 | dataset_format: webqa
157 | multi_turn: False
158 | columns:
159 | prompt: input
160 | query: ''
161 | response: output
162 | history: ''
163 |
164 | # multi-turn datasets
165 | evol_instruct:
166 | hf_hub_url: WizardLM/WizardLM_evol_instruct_V2_196k
167 | local_path: WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json
168 | dataset_format: sharegpt
169 | multi_turn: True
170 |
171 | share_gpt:
172 | hf_hub_url: ''
173 | local_path: /home/robin/prompt_data/sharegpt/sharegpt_split.json
174 | dataset_format: sharegpt
175 | multi_turn: True
176 |
--------------------------------------------------------------------------------
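A sketch of how one of the entries above is consumed: `DataArguments.init_for_training` (in `chatllms/configs/data_args.py` below) loads this YAML and turns each block into a `DatasetAttr`. A minimal standalone parse, with the file path as a placeholder, looks like this:

    import yaml

    # Hypothetical path; point this at data/dataset_info.yaml inside the repo.
    with open('data/dataset_info.yaml', 'r') as f:
        datasets_info = yaml.safe_load(f)

    attr = datasets_info['alpaca']
    print(attr['hf_hub_url'])      # tatsu-lab/alpaca
    print(attr['dataset_format'])  # alpaca
    print(attr.get('multi_turn'))  # False
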
/source/model/llama2/chatllms/train/training.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import math
4 | import os
5 | from typing import Any, Dict
6 |
7 | import numpy as np
8 | import transformers
9 | from torch.utils.data import Dataset
10 |
11 |
12 | def train_and_evaluate(trainer: transformers.Trainer, args: argparse.Namespace,
13 | logger: None) -> None:
14 | """
15 | Trains and evaluates a machine learning model.
16 |
17 | Args:
18 | trainer (Trainer): The training object to use for training and evaluation.
19 | args (argparse.Namespace): The command line arguments for the current run.
20 | Returns:
21 | None
22 | """
23 | # Create dictionary to store metrics
24 | all_metrics: Dict[str, Any] = {'run_name': args.run_name}
25 |
26 | # Training
27 | if args.do_train:
28 | logger.info('=' * 80)
29 | logger.info('*** Train ***')
30 | logger.info('=' * 80)
31 | train_result = trainer.train(
32 | resume_from_checkpoint=args.resume_checkpoint)
33 | metrics = train_result.metrics
34 |
35 | metrics['train_samples'] = len(trainer.train_dataset)
36 |
37 | # Log and save training metrics
38 | trainer.log_metrics('train', metrics)
39 | trainer.save_metrics('train', metrics)
40 | trainer.save_state()
41 |
42 | # Update metrics dictionary with training metrics
43 | all_metrics.update(metrics)
44 |
45 | # Evaluation
46 | if args.do_eval:
47 | logger.info('=' * 80)
48 | logger.info('*** Evaluate ***')
49 | logger.info('=' * 80)
50 |
51 | # Evaluate the trained model and obtain evaluation metrics
52 | metrics = trainer.evaluate(metric_key_prefix='eval')
53 |
54 | try:
55 | perplexity = math.exp(metrics['eval_loss'])
56 | except OverflowError:
57 | perplexity = float('inf')
58 |
59 | metrics['perplexity'] = perplexity
60 | metrics['eval_samples'] = len(trainer.eval_dataset)
61 | # Log and save evaluation metrics
62 | trainer.log_metrics('eval', metrics)
63 | trainer.save_metrics('eval', metrics)
64 |
65 | # Update metrics dictionary with evaluation metrics
66 | all_metrics.update(metrics)
67 |
68 | # Save all metrics to a json file
69 | if args.do_train or args.do_eval:
70 | with open(os.path.join(args.output_dir, 'metrics.json'), 'w') as fout:
71 | fout.write(json.dumps(all_metrics))
72 |
73 |
74 | def predict_and_save(trainer: transformers.Trainer,
75 | tokenizer: transformers.PreTrainedTokenizer,
76 | predict_dataset: Dataset, args: argparse.Namespace,
77 | logger: None) -> None:
78 | """
79 | Make predictions on new data, save them to a file along with input examples,
80 | and update the overall metrics.
81 | """
82 | logger.info('=' * 80)
83 | logger.info('*** Predict ***')
84 | logger.info('=' * 80)
85 | data_dict = predict_dataset.dataset
86 |
87 | # Make predictions on the test dataset
88 | prediction_output = trainer.predict(test_dataset=predict_dataset,
89 | metric_key_prefix='predict')
90 |
91 | # Get the predictions and metrics
92 | prediction_metrics = prediction_output.metrics
93 | predictions = prediction_output.predictions
94 |
95 | # Replace -100 values with pad token ID and decode predictions
96 | predictions = np.where(predictions != -100, predictions,
97 | tokenizer.pad_token_id)
98 | predictions = tokenizer.batch_decode(predictions,
99 | skip_special_tokens=True,
100 | clean_up_tokenization_spaces=True)
101 |
102 | data_dict = predict_dataset.dataset
103 | # Create dictionary to store metrics
104 | all_metrics: Dict[str, Any] = {'run_name': args.run_name}
105 | # Write predictions and input examples to file
106 | with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
107 | for i, example in enumerate(data_dict):
108 | example['prediction_with_input'] = predictions[i].strip()
109 | example['prediction'] = predictions[i].replace(
110 | example['input'], '').strip()
111 | fout.write(json.dumps(example) + '\n')
112 |
113 | # Print and log the prediction metrics
114 | print(prediction_metrics)
115 | trainer.log_metrics('predict', prediction_metrics)
116 | trainer.save_metrics('predict', prediction_metrics)
117 |
118 | # Update the overall metrics
119 | all_metrics.update(prediction_metrics)
120 |
121 | # Save the overall metrics to a file
122 | with open(os.path.join(args.output_dir, 'eval_metrics.json'), 'w') as fout:
123 | fout.write(json.dumps(all_metrics))
124 |
--------------------------------------------------------------------------------
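A sketch of calling `train_and_evaluate` once a `Trainer` is in place. The function only reads a few attributes from the argument namespace, so a minimal namespace like the one below (with placeholder values) is enough; in the real scripts it is the merged dataclass namespace.

    import argparse
    import logging
    from transformers import Trainer
    from chatllms.train.training import train_and_evaluate

    def run(trainer: Trainer, output_dir: str) -> None:
        logger = logging.getLogger('chatllms')
        args = argparse.Namespace(
            run_name='demo-run',       # recorded in metrics.json
            do_train=True,
            do_eval=True,
            resume_checkpoint=None,    # or a checkpoint path to resume from
            output_dir=output_dir,     # metrics.json is written here
        )
        train_and_evaluate(trainer, args, logger)
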
/source/model/llama2/chatllms/configs/data_args.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 | from typing import List, Optional
4 |
5 | import yaml
6 |
7 |
8 | @dataclass
9 | class DatasetAttr(object):
10 |
11 | dataset_name: Optional[str] = None
12 | hf_hub_url: Optional[str] = None
13 | local_path: Optional[str] = None
14 | dataset_format: Optional[str] = None
15 | load_from_local: bool = False
16 | multi_turn: Optional[bool] = False
17 |
18 | def __repr__(self) -> str:
19 | rep = (f'dataset_name: {self.dataset_name} || '
20 | f'hf_hub_url: {self.hf_hub_url} || '
21 | f'local_path: {self.local_path} \n'
22 |                f'dataset_format: {self.dataset_format} || '
23 | f'load_from_local: {self.load_from_local} || '
24 | f'multi_turn: {self.multi_turn}')
25 | return rep
26 |
27 | def __post_init__(self):
28 | self.prompt_column = 'instruction'
29 | self.query_column = 'input'
30 | self.response_column = 'output'
31 | self.history_column = None
32 |
33 |
34 | @dataclass
35 | class DataArguments:
36 | dataset_cfg: Optional[str] = field(
37 | default='./data/alpaca_zh.yaml',
38 | metadata={
39 | 'help':
40 | 'Path to dataset infos, please refer to `./data/README.md` to see how to prepare your datasets for training.'
41 | })
42 | instruction_template: str = field(
43 | default='default',
44 | metadata={
45 | 'help':
46 | 'Which template to use for constructing prompts in training and inference.'
47 | })
48 | conversation_template: str = field(
49 | default='default',
50 | metadata={
51 | 'help':
52 | 'Which template to use for constructing prompts in multi-turn dataset training and inference.'
53 | })
54 |     # Size (number of examples) of the validation split.
55 | eval_dataset_size: Optional[float] = field(
56 | default=0.1, metadata={'help': 'Size of validation dataset.'})
57 |     # Maximum number of training samples, mainly for quickly debugging the training code.
58 | max_train_samples: Optional[int] = field(
59 | default=None,
60 | metadata={
61 | 'help':
62 | 'For debugging purposes or quicker training, truncate the number of training examples to this '
63 | 'value if set.'
64 | },
65 | )
66 |     # Similar to max_train_samples; mainly for quickly debugging the training code.
67 | max_eval_samples: Optional[int] = field(
68 | default=None,
69 | metadata={
70 | 'help':
71 | 'For debugging purposes or quicker training, truncate the number of evaluation examples to this '
72 | 'value if set.'
73 | },
74 | )
75 |
76 | def init_for_training(self): # support mixing multiple datasets
77 | assert self.dataset_cfg is not None and os.path.exists(
78 | self.dataset_cfg
79 | ), f'{self.dataset_cfg} does not exist!, please check the path.'
80 | datasets_info = yaml.safe_load(open(self.dataset_cfg, 'r'))
81 | self.dataset_names = list(datasets_info.keys())
82 | self.dataset_attr_list: List[DatasetAttr] = []
83 | for i, name in enumerate(self.dataset_names):
84 | dataset_attr = DatasetAttr()
85 | dataset_attr.dataset_name = name
86 | dataset_attr.dataset_format = datasets_info[name].get(
87 | 'dataset_format', None)
88 | dataset_attr.hf_hub_url = datasets_info[name].get(
89 | 'hf_hub_url', None)
90 | dataset_attr.local_path = datasets_info[name].get(
91 | 'local_path', None)
92 | dataset_attr.multi_turn = datasets_info[name].get(
93 | 'multi_turn', False)
94 |
95 | if datasets_info[name]['local_path'] and os.path.exists(
96 | datasets_info[name]['local_path']):
97 | dataset_attr.load_from_local = True
98 | else:
99 | dataset_attr.load_from_local = False
100 |                 print(
101 |                     'Warning: local_path {} for {} does not exist! Will load the data from {}'
102 |                     .format(dataset_attr.local_path, name,
103 |                             dataset_attr.hf_hub_url))
104 |
105 | if 'columns' in datasets_info[name]:
106 | dataset_attr.prompt_column = datasets_info[name][
107 | 'columns'].get('prompt', None)
108 | dataset_attr.query_column = datasets_info[name]['columns'].get(
109 | 'query', None)
110 | dataset_attr.response_column = datasets_info[name][
111 | 'columns'].get('response', None)
112 | dataset_attr.history_column = datasets_info[name][
113 | 'columns'].get('history', None)
114 |
115 | self.dataset_attr_list.append(dataset_attr)
116 |
--------------------------------------------------------------------------------
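A sketch of exercising `DataArguments` directly; the config path is a placeholder for any YAML in the format of `data/dataset_info.yaml` shown earlier.

    from chatllms.configs.data_args import DataArguments

    data_args = DataArguments(dataset_cfg='data/alpaca_zh.yaml')  # hypothetical path
    data_args.init_for_training()

    for attr in data_args.dataset_attr_list:
        # __repr__ prints the name, hub url, local path, format and multi_turn flag.
        print(attr)
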
/source/model/llama2/chatllms/utils/logger_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import torch.distributed as dist
4 |
5 | logger_initialized: dict = {}
6 |
7 |
8 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
9 | """Initialize and get a logger by name.
10 |
11 | If the logger has not been initialized, this method will initialize the
12 | logger by adding one or two handlers, otherwise the initialized logger will
13 | be directly returned. During initialization, a StreamHandler will always be
14 | added. If `log_file` is specified and the process rank is 0, a FileHandler
15 | will also be added.
16 |
17 | Args:
18 | name (str): Logger name.
19 | log_file (str | None): The log filename. If specified, a FileHandler
20 | will be added to the logger.
21 | log_level (int): The logger level. Note that only the process of
22 | rank 0 is affected, and other processes will set the level to
23 | "Error" thus be silent most of the time.
24 | file_mode (str): The file mode used in opening log file.
25 | Defaults to 'w'.
26 |
27 | Returns:
28 | logging.Logger: The expected logger.
29 | """
30 | logger = logging.getLogger(name)
31 | if name in logger_initialized:
32 | return logger
33 | # handle hierarchical names
34 | # e.g., logger "a" is initialized, then logger "a.b" will skip the
35 | # initialization since it is a child of "a".
36 | for logger_name in logger_initialized:
37 | if name.startswith(logger_name):
38 | return logger
39 |
40 | # handle duplicate logs to the console
41 | # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET)
42 | # to the root logger. As logger.propagate is True by default, this root
43 | # level handler causes logging messages from rank>0 processes to
44 | # unexpectedly show up on the console, creating much unwanted clutter.
45 | # To fix this issue, we set the root logger's StreamHandler, if any, to log
46 | # at the ERROR level.
47 | for handler in logger.root.handlers:
48 | if type(handler) is logging.StreamHandler:
49 | handler.setLevel(logging.ERROR)
50 |
51 | stream_handler = logging.StreamHandler()
52 | handlers = [stream_handler]
53 |
54 | if dist.is_available() and dist.is_initialized():
55 | rank = dist.get_rank()
56 | else:
57 | rank = 0
58 |
59 | # only rank 0 will add a FileHandler
60 | if rank == 0 and log_file is not None:
61 | # Here, the default behaviour of the official logger is 'a'. Thus, we
62 | # provide an interface to change the file mode to the default
63 | # behaviour.
64 | file_handler = logging.FileHandler(log_file, file_mode)
65 | handlers.append(file_handler)
66 |
67 | formatter = logging.Formatter(
68 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
69 | for handler in handlers:
70 | handler.setFormatter(formatter)
71 | handler.setLevel(log_level)
72 | logger.addHandler(handler)
73 |
74 | if rank == 0:
75 | logger.setLevel(log_level)
76 | else:
77 | logger.setLevel(logging.ERROR)
78 |
79 | logger_initialized[name] = True
80 |
81 | return logger
82 |
83 |
84 | def print_log(msg, logger=None, level=logging.INFO):
85 | """Print a log message.
86 |
87 | Args:
88 | msg (str): The message to be logged.
89 | logger (logging.Logger | str | None): The logger to be used.
90 | Some special loggers are:
91 |
92 | - "silent": no message will be printed.
93 | - other str: the logger obtained with `get_root_logger(logger)`.
94 | - None: The `print()` method will be used to print log messages.
95 | level (int): Logging level. Only available when `logger` is a Logger
96 | object or "root".
97 | """
98 | if logger is None:
99 | print(msg)
100 | elif isinstance(logger, logging.Logger):
101 | logger.log(level, msg)
102 | elif logger == 'silent':
103 | pass
104 | elif isinstance(logger, str):
105 | _logger = get_logger(logger)
106 | _logger.log(level, msg)
107 | else:
108 | raise TypeError(
109 | 'logger should be either a logging.Logger object, str, '
110 | f'"silent" or None, but got {type(logger)}')
111 |
112 |
113 | def get_root_logger(log_file=None, log_level=logging.INFO):
114 | """Get root logger.
115 |
116 | Args:
117 | log_file (str, optional): File path of log. Defaults to None.
118 | log_level (int, optional): The level of logger.
119 | Defaults to logging.INFO.
120 |
121 | Returns:
122 | :obj:`logging.Logger`: The obtained logger
123 | """
124 | logger = get_logger(name='chatllms',
125 | log_file=log_file,
126 | log_level=log_level)
127 |
128 | return logger
129 |
--------------------------------------------------------------------------------
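A sketch of using the logger helpers above. Passing a file path to `get_root_logger` adds a FileHandler on rank 0; `None` keeps console-only logging.

    import logging
    from chatllms.utils.logger_utils import get_root_logger, print_log

    logger = get_root_logger(log_file=None, log_level=logging.INFO)
    logger.info('starting up')

    # print_log routes a message to print(), a named logger, or drops it ('silent').
    print_log('plain print, no logger attached')
    print_log('goes through the chatllms logger', logger=logger)
    print_log('suppressed entirely', logger='silent')
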
/source/model/llama2/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import pathlib
4 | from typing import Tuple
5 |
6 | import torch
7 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
8 | HfArgumentParser, PreTrainedModel,
9 | PreTrainedTokenizer, Trainer)
10 |
11 | from chatllms.configs import DataArguments, ModelArguments, TrainingArguments
12 | from chatllms.data import make_supervised_data_module
13 | from chatllms.utils.model_utils import (add_special_tokens_if_missing,
14 | safe_save_model_for_hf_trainer)
15 |
16 |
17 | def load_model_tokenizer(args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
18 | """
19 | Load a pre-trained model and tokenizer for natural language processing tasks.
20 |
21 | Args:
22 | args: An object containing the input arguments.
23 |
24 | Returns:
25 | A tuple containing the loaded model and tokenizer.
26 | """
27 | # Determine the torch data type based on the input arguments
28 | torch_dtype = torch.float16 if args.fp16 else (
29 | torch.bfloat16 if args.bf16 else torch.float32)
30 |
31 | config_kwargs = {
32 | 'cache_dir': args.cache_dir,
33 | 'use_auth_token': args.use_auth_token,
34 | 'trust_remote_code': args.trust_remote_code,
35 | }
36 |
37 | # Load the pre-trained model
38 | print(f'Loading Model from {args.model_name_or_path}...')
39 | model = AutoModelForCausalLM.from_pretrained(
40 | args.model_name_or_path,
41 | torch_dtype=torch_dtype,
42 | **config_kwargs,
43 | )
44 |
45 | # Enable model parallelism
46 | setattr(model, 'model_parallel', True)
47 | setattr(model, 'is_parallelizable', True)
48 |
49 | if args.gradient_checkpointing:
50 | logging.warning('Using gradient checkpointing...')
51 | model.enable_input_require_grads()
52 | model.config.use_cache = False # Turn off when gradient checkpointing is enabled
53 |
54 | # Load the tokenizer
55 | print(f'Loading tokenizer from {args.model_name_or_path}...')
56 | tokenizer = AutoTokenizer.from_pretrained(
57 | args.model_name_or_path,
58 | padding_side='right',
59 | model_max_length=args.model_max_length,
60 | use_fast=False,
61 | tokenizer_type='llama' if 'llama' in args.model_name_or_path else None,
62 | **config_kwargs,
63 | )
64 |
65 | return model, tokenizer
66 |
67 |
68 | def train() -> None:
69 | """
70 | Trains a language model using Hugging Face's Transformers library.
71 |
72 |     Command-line arguments (parsed via HfArgumentParser):
73 | model_args (ModelArguments): The arguments for the model configuration.
74 | data_args (DataArguments): The arguments for the data configuration.
75 | training_args (TrainingArguments): The arguments for the training configuration.
76 |
77 | Returns:
78 | None
79 |
80 | """
81 | parser = HfArgumentParser(
82 | (ModelArguments, DataArguments, TrainingArguments))
83 | (model_args, data_args,
84 | training_args) = parser.parse_args_into_dataclasses()
85 | data_args.init_for_training()
86 | args = argparse.Namespace(**vars(model_args), **vars(data_args),
87 | **vars(training_args))
88 | # load model and tokenizer
89 | logging.warning('Loading model and tokenizer...')
90 | model, tokenizer = load_model_tokenizer(args=args)
91 | logging.warning('Successfully loaded model and tokenizer.')
92 |
93 | if 'llama' in args.model_name_or_path or 'baichuan' in args.model_name_or_path:
94 | logging.warning(
95 | f'Adding special tokens for {args.model_name_or_path}.')
96 | add_special_tokens_if_missing(tokenizer, model)
97 |
98 | if 'baichuan' in args.model_name_or_path:
99 | # Tie the weights
100 | model.tie_weights()
101 |
102 | # Create a supervised dataset and Trainer, then train the model
103 | logging.warning('Creating a supervised dataset and DataCollator...')
104 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args)
105 |
106 | # Initialize the Trainer object and start training
107 | logging.warning('Initializing Trainer object.')
108 | trainer = Trainer(
109 | model=model,
110 | tokenizer=tokenizer,
111 | args=training_args,
112 | **data_module,
113 | )
114 |
115 | logging.warning('Start Training...')
116 | if list(pathlib.Path(training_args.output_dir).glob('checkpoint-*')):
117 | trainer.train(resume_from_checkpoint=True)
118 | else:
119 | trainer.train()
120 |
121 | logging.warning(f'Saving Model to {training_args.output_dir}')
122 | trainer.save_state()
123 | # Save the trained model
124 | safe_save_model_for_hf_trainer(trainer=trainer,
125 | output_dir=training_args.output_dir)
126 |
127 | logging.warning('Done.')
128 |
129 |
130 | if __name__ == '__main__':
131 | train()
132 |
--------------------------------------------------------------------------------
/retrieval_contriever/generate_passage_embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import os
8 |
9 | import argparse
10 | import csv
11 | import logging
12 | import pickle
13 |
14 | import numpy as np
15 | import torch
16 |
17 | import transformers
18 |
19 | import src.slurm
20 | import src.contriever
21 | import src.utils
22 | import src.data
23 | import src.normalize_text
24 |
25 |
26 | def embed_passages(args, passages, model, tokenizer):
27 | total = 0
28 | allids, allembeddings = [], []
29 | batch_ids, batch_text = [], []
30 | with torch.no_grad():
31 | for k, p in enumerate(passages):
32 | batch_ids.append(p["id"])
33 | if args.no_title or not "title" in p:
34 | text = p["text"]
35 | else:
36 | text = p["title"] + " " + p["text"]
37 | if args.lowercase:
38 | text = text.lower()
39 | if args.normalize_text:
40 | text = src.normalize_text.normalize(text)
41 | batch_text.append(text)
42 |
43 | if len(batch_text) == args.per_gpu_batch_size or k == len(passages) - 1:
44 |
45 | encoded_batch = tokenizer.batch_encode_plus(
46 | batch_text,
47 | return_tensors="pt",
48 | max_length=args.passage_maxlength,
49 | padding=True,
50 | truncation=True,
51 | )
52 |
53 | encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
54 | embeddings = model(**encoded_batch)
55 |
56 | embeddings = embeddings.cpu()
57 | total += len(batch_ids)
58 | allids.extend(batch_ids)
59 | allembeddings.append(embeddings)
60 |
61 | batch_text = []
62 | batch_ids = []
63 | if k % 100000 == 0 and k > 0:
64 | print(f"Encoded passages {total}")
65 |
66 | allembeddings = torch.cat(allembeddings, dim=0).numpy()
67 | return allids, allembeddings
68 |
69 |
70 | def main(args):
71 | model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
72 | print(f"Model loaded from {args.model_name_or_path}.", flush=True)
73 | model.eval()
74 | model = model.cuda()
75 | if not args.no_fp16:
76 | model = model.half()
77 |
78 | passages = src.data.load_passages(args.passages)
79 |
80 | shard_size = len(passages) // args.num_shards
81 | start_idx = args.shard_id * shard_size
82 | end_idx = start_idx + shard_size
83 | if args.shard_id == args.num_shards - 1:
84 | end_idx = len(passages)
85 |
86 | passages = passages[start_idx:end_idx]
87 | print(f"Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}.")
88 |
89 | allids, allembeddings = embed_passages(args, passages, model, tokenizer)
90 |
91 | save_file = os.path.join(args.output_dir, args.prefix + f"_{args.shard_id:02d}")
92 | os.makedirs(args.output_dir, exist_ok=True)
93 | print(f"Saving {len(allids)} passage embeddings to {save_file}.")
94 | with open(save_file, mode="wb") as f:
95 | pickle.dump((allids, allembeddings), f)
96 |
97 | print(f"Total passages processed {len(allids)}. Written to {save_file}.")
98 |
99 |
100 | if __name__ == "__main__":
101 | parser = argparse.ArgumentParser()
102 |
103 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
104 | parser.add_argument("--output_dir", type=str, default="wikipedia_embeddings", help="dir path to save embeddings")
105 | parser.add_argument("--prefix", type=str, default="passages", help="prefix path to save embeddings")
106 | parser.add_argument("--shard_id", type=int, default=0, help="Id of the current shard")
107 | parser.add_argument("--num_shards", type=int, default=1, help="Total number of shards")
108 | parser.add_argument(
109 | "--per_gpu_batch_size", type=int, default=512, help="Batch size for the passage encoder forward pass"
110 | )
111 | parser.add_argument("--passage_maxlength", type=int, default=512, help="Maximum number of tokens in a passage")
112 | parser.add_argument(
113 | "--model_name_or_path", type=str, help="path to directory containing model weights and config file"
114 | )
115 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
116 | parser.add_argument("--no_title", action="store_true", help="title not added to the passage body")
117 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
118 | parser.add_argument("--normalize_text", action="store_true", help="lowercase text before encoding")
119 |
120 | args = parser.parse_args()
121 |
122 | src.slurm.init_distributed_mode(args)
123 |
124 | main(args)
125 |
--------------------------------------------------------------------------------
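The script above writes each shard as a pickled `(ids, embeddings)` tuple. A minimal sketch for loading the shards back into one matrix, assuming the default `--output_dir` and `--prefix` values:

import glob
import pickle

import numpy as np

ids, embs = [], []
for shard in sorted(glob.glob("wikipedia_embeddings/passages_*")):
    with open(shard, "rb") as f:
        shard_ids, shard_embs = pickle.load(f)  # the (allids, allembeddings) tuple written above
    ids.extend(shard_ids)
    embs.append(shard_embs)
embeddings = np.concatenate(embs, axis=0)  # shape: (num_passages, hidden_size)
print(len(ids), embeddings.shape)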
/source/model/llama2/cli_demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | from threading import Thread
4 | from typing import List, Tuple
5 |
6 | import torch
7 | import transformers
8 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel,
9 | PreTrainedTokenizer, TextIteratorStreamer)
10 |
11 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
12 | from chatllms.utils.model_utils import get_logits_processor
13 | from chatllms.utils.template import PromptTemplate
14 |
15 |
16 | def generate_response(
17 | query: str,
18 | history: List[Tuple[str, str]],
19 | prefix: str,
20 | prompt_template: PromptTemplate,
21 | tokenizer: PreTrainedTokenizer,
22 | model: PreTrainedModel,
23 |     generation_args: GenerationArguments,
24 | ) -> List[Tuple[str, str]]:
25 | """
26 |     Generates a response to the given query with the loaded causal language model and prints it to the console.
27 |
28 | Args:
29 | query (str): The input query for which a response is to be generated.
30 | history (List[Tuple[str, str]]): A list of previous queries and their responses.
31 | prefix (str): The prefix string added to the beginning of each input sequence.
32 | prompt_template (PromptTemplate): The prompt template used to generate the input sequence to the model.
33 | tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens.
34 |         model (PreTrainedModel): The causal language model used to generate the response.
35 |         generation_args (GenerationArguments): Generation settings forwarded to the model's generate() method via to_dict().
36 |
37 | Returns:
38 | List[Tuple[str, str]]: A list of all the previous queries and their responses, including the current one.
39 | """
40 |
41 | # Convert the query and history into input IDs
42 | input_text = prompt_template.get_prompt(query, history, prefix)
43 | inputs = tokenizer(input_text, return_tensors='pt')
44 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
45 |
46 | # Create a TextIteratorStreamer object to stream the response from the model
47 | streamer = TextIteratorStreamer(tokenizer,
48 | timeout=60.0,
49 | skip_prompt=True,
50 | skip_special_tokens=True)
51 |
52 | # Set the arguments for the model's generate() method
53 | gen_kwargs = dict(
54 | inputs,
55 | streamer=streamer,
56 | logits_processor=get_logits_processor(),
57 | **generation_args.to_dict(),
58 | )
59 |
60 | # Start a separate thread to generate the response asynchronously
61 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
62 | thread.start()
63 |
64 | # Print the model name and the response as it is generated
65 | print('Assistant: ', end='', flush=True)
66 | response = ''
67 | for new_text in streamer:
68 | print(new_text, end='', flush=True)
69 | response += new_text
70 | print()
71 |
72 | # Update the history with the current query and response and return it
73 | history.append((query, response))
74 | return history
75 |
76 |
77 | def main():
78 | os_name = platform.system()
79 | clear_command = 'cls' if os_name == 'Windows' else 'clear'
80 |
81 | # Parse command-line arguments
82 | parser = transformers.HfArgumentParser(
83 | (ModelInferenceArguments, GenerationArguments))
84 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
85 |
86 |     # Load the model and tokenizer. device_map='auto' lets accelerate place the
87 |     # weights, so the model is not moved to a device manually afterwards.
88 |
89 | model = AutoModelForCausalLM.from_pretrained(
90 | model_server_args.model_name_or_path,
91 | trust_remote_code=True,
92 | low_cpu_mem_usage=True,
93 | torch_dtype=torch.float16,
94 |         device_map='auto').eval()
95 |
96 | tokenizer = AutoTokenizer.from_pretrained(
97 | model_server_args.model_name_or_path,
98 | trust_remote_code=True,
99 | use_fast=False,
100 | )
101 |
102 | prompt_template = PromptTemplate(model_server_args.prompt_template)
103 | prefix = model_server_args.source_prefix if model_server_args.source_prefix else ''
104 |     history: List[Tuple[str, str]] = []
105 |     print('Welcome to the CLI chat. Type a message to talk, "clear" to reset the history, "stop" to exit.')
106 | while True:
107 | try:
108 | query = input('\nUser: ')
109 | except UnicodeDecodeError:
110 | print(
111 | 'Detected decoding error at the inputs, please set the terminal encoding to utf-8.'
112 | )
113 | continue
114 | if query.strip() == 'stop':
115 | break
116 |
117 | if query.strip() == 'clear':
118 | # Clear the conversation history
119 | history = []
120 | os.system(clear_command)
121 |             print('Welcome to the CLI chat. Type a message to talk, "clear" to reset the history, "stop" to exit.')
122 | continue
123 |
124 | # Perform prediction and printing
125 | history = generate_response(query, history, prefix, prompt_template,
126 | tokenizer, model, generation_args)
127 |
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
--------------------------------------------------------------------------------
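generate_response() streams tokens by running model.generate in a background thread and iterating over a TextIteratorStreamer. A self-contained sketch of the same pattern; the model name and prompt are placeholders, not part of this project:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")        # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate,
                kwargs=dict(inputs, streamer=streamer, max_new_tokens=32))
thread.start()
for piece in streamer:           # pieces arrive as generate() produces them
    print(piece, end="", flush=True)
thread.join()
print()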
/retrieval_contriever/src/normalize_text.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from chemdataextractor.text.normalize
3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 | Tools for normalizing text.
5 | https://github.com/mcs07/ChemDataExtractor
6 | :copyright: Copyright 2016 by Matt Swain.
7 | :license: MIT
8 |
9 | Permission is hereby granted, free of charge, to any person obtaining
10 | a copy of this software and associated documentation files (the
11 | 'Software'), to deal in the Software without restriction, including
12 | without limitation the rights to use, copy, modify, merge, publish,
13 | distribute, sublicense, and/or sell copies of the Software, and to
14 | permit persons to whom the Software is furnished to do so, subject to
15 | the following conditions:
16 |
17 | The above copyright notice and this permission notice shall be
18 | included in all copies or substantial portions of the Software.
19 |
20 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | """
28 |
29 | #: Control characters.
30 | CONTROLS = {
31 | '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
32 | '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
33 | }
34 | # There are further control characters, but they are instead replaced with a space by unicode normalization
35 | # '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
36 |
37 |
38 | #: Hyphen and dash characters.
39 | HYPHENS = {
40 | '-', # \u002d Hyphen-minus
41 | '‐', # \u2010 Hyphen
42 | '‑', # \u2011 Non-breaking hyphen
43 | '⁃', # \u2043 Hyphen bullet
44 | '‒', # \u2012 figure dash
45 | '–', # \u2013 en dash
46 | '—', # \u2014 em dash
47 | '―', # \u2015 horizontal bar
48 | }
49 |
50 | #: Minus characters.
51 | MINUSES = {
52 | '-', # \u002d Hyphen-minus
53 | '−', # \u2212 Minus
54 | '-', # \uff0d Full-width Hyphen-minus
55 | '⁻', # \u207b Superscript minus
56 | }
57 |
58 | #: Plus characters.
59 | PLUSES = {
60 | '+', # \u002b Plus
61 | '+', # \uff0b Full-width Plus
62 | '⁺', # \u207a Superscript plus
63 | }
64 |
65 | #: Slash characters.
66 | SLASHES = {
67 | '/', # \u002f Solidus
68 | '⁄', # \u2044 Fraction slash
69 | '∕', # \u2215 Division slash
70 | }
71 |
72 | #: Tilde characters.
73 | TILDES = {
74 | '~', # \u007e Tilde
75 | '˜', # \u02dc Small tilde
76 | '⁓', # \u2053 Swung dash
77 | '∼', # \u223c Tilde operator #in mbert vocab
78 | '∽', # \u223d Reversed tilde
79 | '∿', # \u223f Sine wave
80 | '〜', # \u301c Wave dash #in mbert vocab
81 | '~', # \uff5e Full-width tilde #in mbert vocab
82 | }
83 |
84 | #: Apostrophe characters.
85 | APOSTROPHES = {
86 | "'", # \u0027
87 | '’', # \u2019
88 | '՚', # \u055a
89 | 'Ꞌ', # \ua78b
90 | 'ꞌ', # \ua78c
91 | ''', # \uff07
92 | }
93 |
94 | #: Single quote characters.
95 | SINGLE_QUOTES = {
96 | "'", # \u0027
97 | '‘', # \u2018
98 | '’', # \u2019
99 | '‚', # \u201a
100 | '‛', # \u201b
101 |
102 | }
103 |
104 | #: Double quote characters.
105 | DOUBLE_QUOTES = {
106 | '"', # \u0022
107 | '“', # \u201c
108 | '”', # \u201d
109 | '„', # \u201e
110 | '‟', # \u201f
111 | }
112 |
113 | #: Accent characters.
114 | ACCENTS = {
115 | '`', # \u0060
116 | '´', # \u00b4
117 | }
118 |
119 | #: Prime characters.
120 | PRIMES = {
121 | '′', # \u2032
122 | '″', # \u2033
123 | '‴', # \u2034
124 | '‵', # \u2035
125 | '‶', # \u2036
126 | '‷', # \u2037
127 | '⁗', # \u2057
128 | }
129 |
130 | #: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
131 | QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
132 |
133 | def normalize(text):
134 | for control in CONTROLS:
135 | text = text.replace(control, '')
136 | text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')
137 |
138 | for hyphen in HYPHENS | MINUSES:
139 | text = text.replace(hyphen, '-')
140 | text = text.replace('\u00ad', '')
141 |
142 | for double_quote in DOUBLE_QUOTES:
143 | text = text.replace(double_quote, '"') # \u0022
144 | for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
145 | text = text.replace(single_quote, "'") # \u0027
146 | text = text.replace('′', "'") # \u2032 prime
147 | text = text.replace('‵', "'") # \u2035 reversed prime
148 | text = text.replace('″', "''") # \u2033 double prime
149 | text = text.replace('‶', "''") # \u2036 reversed double prime
150 | text = text.replace('‴', "'''") # \u2034 triple prime
151 | text = text.replace('‷', "'''") # \u2037 reversed triple prime
152 | text = text.replace('⁗', "''''") # \u2057 quadruple prime
153 |
154 | text = text.replace('…', '...').replace(' . . . ', ' ... ') # \u2026
155 |
156 | for slash in SLASHES:
157 | text = text.replace(slash, '/')
158 |
159 | #for tilde in TILDES:
160 | # text = text.replace(tilde, '~')
161 |
162 | return text
163 |
--------------------------------------------------------------------------------
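For reference, a quick illustration of what normalize() does to quotes, dashes and ellipses; the import path is assumed from its usage in generate_passage_embeddings.py:

from src import normalize_text

print(normalize_text.normalize("“Smart quotes” — and … more"))
# -> "Smart quotes" - and ... more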
/retrieval_contriever/src/moco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import logging
6 | import copy
7 | import transformers
8 |
9 | from src import contriever, dist_utils, utils
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | class MoCo(nn.Module):
15 | def __init__(self, opt):
16 | super(MoCo, self).__init__()
17 |
18 | self.queue_size = opt.queue_size
19 | self.momentum = opt.momentum
20 | self.temperature = opt.temperature
21 | self.label_smoothing = opt.label_smoothing
22 | self.norm_doc = opt.norm_doc
23 | self.norm_query = opt.norm_query
24 | self.moco_train_mode_encoder_k = opt.moco_train_mode_encoder_k # apply the encoder on keys in train mode
25 |
26 | retriever, tokenizer = self._load_retriever(
27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init
28 | )
29 |
30 | self.tokenizer = tokenizer
31 | self.encoder_q = retriever
32 | self.encoder_k = copy.deepcopy(retriever)
33 |
34 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
35 | param_k.data.copy_(param_q.data)
36 | param_k.requires_grad = False
37 |
38 | # create the queue
39 | self.register_buffer("queue", torch.randn(opt.projection_size, self.queue_size))
40 | self.queue = nn.functional.normalize(self.queue, dim=0)
41 |
42 | self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))
43 |
44 | def _load_retriever(self, model_id, pooling, random_init):
45 | cfg = utils.load_hf(transformers.AutoConfig, model_id)
46 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id)
47 |
48 | if "xlm" in model_id:
49 | model_class = contriever.XLMRetriever
50 | else:
51 | model_class = contriever.Contriever
52 |
53 | if random_init:
54 | retriever = model_class(cfg)
55 | else:
56 | retriever = utils.load_hf(model_class, model_id)
57 |
58 | if "bert-" in model_id:
59 | if tokenizer.bos_token_id is None:
60 | tokenizer.bos_token = "[CLS]"
61 | if tokenizer.eos_token_id is None:
62 | tokenizer.eos_token = "[SEP]"
63 |
64 | retriever.config.pooling = pooling
65 |
66 | return retriever, tokenizer
67 |
68 | def get_encoder(self, return_encoder_k=False):
69 | if return_encoder_k:
70 | return self.encoder_k
71 | else:
72 | return self.encoder_q
73 |
74 | def _momentum_update_key_encoder(self):
75 | """
76 | Update of the key encoder
77 | """
78 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
79 | param_k.data = param_k.data * self.momentum + param_q.data * (1.0 - self.momentum)
80 |
81 | @torch.no_grad()
82 | def _dequeue_and_enqueue(self, keys):
83 | # gather keys before updating queue
84 | keys = dist_utils.gather_nograd(keys.contiguous())
85 |
86 | batch_size = keys.shape[0]
87 |
88 | ptr = int(self.queue_ptr)
89 | assert self.queue_size % batch_size == 0, f"{batch_size}, {self.queue_size}" # for simplicity
90 |
91 | # replace the keys at ptr (dequeue and enqueue)
92 | self.queue[:, ptr : ptr + batch_size] = keys.T
93 | ptr = (ptr + batch_size) % self.queue_size # move pointer
94 |
95 | self.queue_ptr[0] = ptr
96 |
97 | def _compute_logits(self, q, k):
98 | l_pos = torch.einsum("nc,nc->n", [q, k]).unsqueeze(-1)
99 | l_neg = torch.einsum("nc,ck->nk", [q, self.queue.clone().detach()])
100 |
101 | logits = torch.cat([l_pos, l_neg], dim=1)
102 | return logits
103 |
104 |     def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats=None, **kwargs):
105 |         iter_stats = {} if iter_stats is None else iter_stats  # avoid sharing a mutable default dict
106 |         bsz = q_tokens.size(0)
107 | q = self.encoder_q(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query)
108 |
109 | # compute key features
110 | with torch.no_grad(): # no gradient to keys
111 | self._momentum_update_key_encoder() # update the key encoder
112 |
113 | if not self.encoder_k.training and not self.moco_train_mode_encoder_k:
114 | self.encoder_k.eval()
115 |
116 | k = self.encoder_k(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc)
117 |
118 | logits = self._compute_logits(q, k) / self.temperature
119 |
120 | # labels: positive key indicators
121 | labels = torch.zeros(bsz, dtype=torch.long).cuda()
122 |
123 | loss = torch.nn.functional.cross_entropy(logits, labels, label_smoothing=self.label_smoothing)
124 |
125 | self._dequeue_and_enqueue(k)
126 |
127 | # log stats
128 | if len(stats_prefix) > 0:
129 | stats_prefix = stats_prefix + "/"
130 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz)
131 |
132 | predicted_idx = torch.argmax(logits, dim=-1)
133 | accuracy = 100 * (predicted_idx == labels).float().mean()
134 | stdq = torch.std(q, dim=0).mean().item()
135 | stdk = torch.std(k, dim=0).mean().item()
136 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz)
137 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz)
138 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz)
139 |
140 | return loss, iter_stats
141 |
--------------------------------------------------------------------------------
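Two details of the MoCo class that are easy to miss are the momentum (EMA) update of the key encoder and the circular queue pointer. A toy numeric illustration of both, with made-up values:

import torch

# EMA update: the key encoder trails the query encoder very slowly.
m = 0.999
param_q, param_k = torch.tensor([1.0]), torch.tensor([0.0])
param_k = param_k * m + param_q * (1.0 - m)
print(param_k)          # tensor([0.0010])

# Circular queue pointer used by _dequeue_and_enqueue.
queue_size, ptr, batch_size = 8, 6, 2
ptr = (ptr + batch_size) % queue_size
print(ptr)              # 0 (wraps around)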
/source/model/llama2/examples/format_data/convert_alpaca.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from datasets import load_dataset
4 |
5 |
6 | def json_dump(obj, path):
7 | with open(path, 'w', encoding='utf-8') as f:
8 | json.dump(obj, f, indent=2, ensure_ascii=False)
9 |
10 |
11 | def json_load(in_file):
12 | with open(in_file, 'r') as f:
13 | json_data = json.load(f)
14 | return json_data
15 |
16 |
17 | def convert_100PoisonMpts(in_file, out_file):
18 | raw_data = load_dataset('json', data_files=in_file)['train']
19 | new_content = []
20 | for i, raw_text in enumerate(raw_data):
21 | prompt = raw_text['prompt']
22 | response = raw_text['answer']
23 | if len(prompt) <= 5 or len(response) <= 5:
24 | continue
25 | new_content.append({
26 | 'instruction': prompt,
27 | 'input': '',
28 | 'output': response,
29 | })
30 |
31 | print(f'#out: {len(new_content)}')
32 | json_dump(new_content, out_file)
33 |
34 |
35 | def convert_Cvalues(in_file, out_file):
36 | raw_data = load_dataset('json', data_files=in_file)['train']
37 | new_content = []
38 | for i, raw_text in enumerate(raw_data):
39 | prompt = raw_text['prompt']
40 | response = raw_text['pos_resp']
41 | if len(prompt) <= 5 or len(response) <= 5:
42 | continue
43 | new_content.append({
44 | 'instruction': prompt,
45 | 'input': '',
46 | 'output': response,
47 | })
48 |
49 | print(f'#out: {len(new_content)}')
50 | json_dump(new_content, out_file)
51 |
52 |
53 | def convert_huatuogpt(in_file, out_file):
54 | raw_data = load_dataset('json', data_files=in_file)['train']
55 | new_content = []
56 | for i, raw_text in enumerate(raw_data):
57 | data = raw_text['data']
58 | prompt = data[0].replace('问:', '')
59 | response = data[1].replace('答:', '')
60 | if len(prompt) <= 5 or len(response) <= 5:
61 | continue
62 | new_content.append({
63 | 'instruction': prompt,
64 | 'input': '',
65 | 'output': response,
66 | })
67 | print(f'#out: {len(new_content)}')
68 | json_dump(new_content, out_file)
69 |
70 |
71 | def convert_safety_attack(in_file, out_file):
72 | field_list = [
73 | 'Reverse_Exposure', 'Goal_Hijacking', 'Prompt_Leaking',
74 | 'Unsafe_Instruction_Topic', 'Role_Play_Instruction',
75 | 'Inquiry_With_Unsafe_Opinion'
76 | ]
77 | new_content = []
78 |     for field in field_list:
79 |         raw_data = load_dataset('json', field=field,
80 |                                 data_files=in_file)['train']
81 | for i, raw_text in enumerate(raw_data):
82 | prompt = raw_text['prompt']
83 | response = raw_text['response']
84 | if len(prompt) <= 5 or len(response) <= 5:
85 | continue
86 | new_content.append({
87 | 'instruction': prompt,
88 | 'input': '',
89 | 'output': response,
90 | })
91 | print(f'#out: {len(new_content)}')
92 | json_dump(new_content, out_file)
93 |
94 |
95 | def convert_safety_scenarios(in_file, out_file):
96 |
97 | field_list = [
98 | 'Unfairness_And_Discrimination', 'Crimes_And_Illegal_Activities',
99 | 'Insult', 'Mental_Health', 'Physical_Harm', 'Privacy_And_Property',
100 | 'Ethics_And_Morality'
101 | ]
102 | new_content = []
103 |     for field in field_list:
104 |         raw_data = load_dataset('json', data_files=in_file,
105 |                                 field=field)['train']
106 | for i, raw_text in enumerate(raw_data):
107 | prompt = raw_text['prompt']
108 | response = raw_text['response']
109 | if len(prompt) <= 5 or len(response) <= 5:
110 | continue
111 | new_content.append({
112 | 'instruction': prompt,
113 | 'input': '',
114 | 'output': response,
115 | })
116 | print(f'#out: {len(new_content)}')
117 | json_dump(new_content, out_file)
118 |
119 |
120 | if __name__ == '__main__':
121 |
122 | data_path = '/home/robin/prompt_data/100PoisonMpts/train.jsonl'
123 | out_path = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.jsonl'
124 | convert_100PoisonMpts(data_path, out_file=out_path)
125 |
126 | data_path = '/home/robin/prompt_data/CValues-Comparison/test.jsonl'
127 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json'
128 | convert_Cvalues(data_path, out_file=out_path)
129 |
130 | data_path = '/home/robin/prompt_data/CValues-Comparison/train.jsonl'
131 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json'
132 | convert_Cvalues(data_path, out_file=out_path)
133 |
134 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_sft_data_v1.jsonl'
135 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json'
136 | convert_huatuogpt(data_path, out_file=out_path)
137 |
138 | data_path = '/home/robin/prompt_data/Safety-Prompts/instruction_attack_scenarios.json'
139 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json'
140 | convert_safety_attack(data_path, out_file=out_path)
141 |
142 | data_path = '/home/robin/prompt_data/Safety-Prompts/typical_safety_scenarios.json'
143 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json'
144 | convert_safety_scenarios(data_path, out_file=out_path)
145 |
--------------------------------------------------------------------------------
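All of the converters above drop pairs where either side is five characters or shorter and emit records in the Alpaca instruction format; a single converted record looks like this (content is purely illustrative):

example = {
    'instruction': 'Explain what machine learning is.',
    'input': '',
    'output': 'Machine learning is ...',
}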
/source/model/llama2/data/dataset_info.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 |
4 | def get_dataset_info(dataset_dir):
5 | """
6 |     Returns a mapping from dataset names to their Hugging Face Hub IDs and/or
7 |     local file paths, along with per-dataset options (multi-turn flag, column mappings).
8 |
9 |     Args:
10 |         dataset_dir (str): The local directory where locally stored datasets are kept.
11 |
12 |     Returns:
13 |         dict: The dataset info dictionary for all supported datasets.
14 | """
15 | dataset_info = {
16 | 'alpaca': {
17 | 'hf_hub_url': 'tatsu-lab/alpaca',
18 | 'local_path': 'tatsu-lab/alpaca/alpaca.json',
19 | 'multi_turn': False
20 | },
21 | 'alpaca-clean': {
22 | 'hf_hub_url': 'yahma/alpaca-cleaned',
23 | 'local_path': '',
24 | 'multi_turn': False
25 | },
26 | 'chip2': {
27 | 'hf_hub_url': 'laion/OIG',
28 | 'local_path': '',
29 | 'multi_turn': False
30 | },
31 | 'self-instruct': {
32 | 'hf_hub_url': 'yizhongw/self_instruct',
33 | 'local_path': '',
34 | 'multi_turn': False
35 | },
36 | 'guanaco': {
37 | 'hf_hub_url': 'JosephusCheung/GuanacoDataset',
38 | 'local_path': '',
39 | 'multi_turn': False
40 | },
41 | 'hh-rlhf': {
42 | 'hf_hub_url': 'Anthropic/hh-rlhf',
43 | 'local_path': '',
44 | 'multi_turn': False
45 | },
46 | 'longformer': {
47 | 'hf_hub_url': 'akoksal/LongForm',
48 | 'local_path': '',
49 | 'multi_turn': False
50 | },
51 | 'openassistant-guanaco': {
52 | 'hf_hub_url':
53 | 'timdettmers/openassistant-guanaco',
54 | 'local_path':
55 | join(dataset_dir,
56 | 'timdettmers/openassistant_best_replies_train.jsonl'),
57 | 'multi_turn':
58 | False
59 | },
60 | 'evol_instruct': {
61 | 'hf_hub_url':
62 | 'WizardLM/WizardLM_evol_instruct_V2_196k',
63 | 'local_path':
64 | join(dataset_dir, 'WizardLM/WizardLM_evol_instruct_V2_143k.json'),
65 | 'multi_turn':
66 | False
67 | },
68 | 'dolly-15k': {
69 | 'hf_hub_url': 'databricks/databricks-dolly-15k',
70 | 'local_path': join(dataset_dir, 'databricks/databricks-dolly-15k'),
71 | 'multi_turn': False
72 | },
73 | 'olcc': {
74 | 'hf_hub_url': 'yizhongw/olcc',
75 | 'local_path': join(dataset_dir, 'olcc/olcc_alpaca.json'),
76 | 'multi_turn': False
77 | },
78 | 'share_gpt': {
79 | 'hf_hub_url': '',
80 | 'local_path': join(dataset_dir, 'sharegpt/sharegpt_split.json'),
81 | 'multi_turn': True
82 | },
83 | '100PoisonMpts': {
84 | 'hf_hub_url': '',
85 | 'local_path': join(dataset_dir, '100PoisonMpts/train.jsonl'),
86 | 'multi_turn': False
87 | },
88 | 'belle_0.5m': {
89 | 'hf_hub_url': 'BelleGroup/train_0.5M_CN',
90 | 'local_path': '',
91 | 'multi_turn': False
92 | },
93 | 'belle_1m': {
94 | 'hf_hub_url': 'BelleGroup/train_1M_CN',
95 | 'local_path': '',
96 | 'multi_turn': False
97 | },
98 | 'belle_2m': {
99 | 'hf_hub_url': 'BelleGroup/train_2M_CN',
100 | 'local_path': '',
101 | 'multi_turn': False
102 | },
103 | 'belle_dialog': {
104 | 'hf_hub_url': 'BelleGroup/generated_chat_0.4M',
105 | 'local_path': '',
106 | 'multi_turn': False
107 | },
108 | 'belle_math': {
109 | 'hf_hub_url': 'BelleGroup/school_math_0.25M',
110 | 'local_path': '',
111 | 'multi_turn': False
112 | },
113 | 'belle_multiturn': {
114 | 'hf_hub_url': 'BelleGroup/multi_turn_0.5M',
115 | 'local_path': '',
116 | 'multi_turn': True,
117 | 'columns': {
118 | 'prompt': 'instruction',
119 | 'query': '',
120 | 'response': 'output',
121 | 'history': 'history'
122 | }
123 | },
124 | 'firefly': {
125 | 'hf_hub_url': 'YeungNLP/firefly-train-1.1M',
126 | 'local_path': '',
127 | 'multi_turn': False,
128 | 'columns': {
129 | 'prompt': 'input',
130 | 'query': '',
131 | 'response': 'target',
132 | 'history': ''
133 | }
134 | },
135 | 'codealpaca': {
136 | 'hf_hub_url': 'sahil2801/CodeAlpaca-20k',
137 | 'local_path': '',
138 | 'multi_turn': False
139 | },
140 | 'alpaca_cot': {
141 | 'hf_hub_url': 'QingyiSi/Alpaca-CoT',
142 | 'local_path': '',
143 | 'multi_turn': False
144 | },
145 | 'webqa': {
146 | 'hf_hub_url': 'suolyer/webqa',
147 | 'local_path': '',
148 | 'multi_turn': False,
149 | 'columns': {
150 | 'prompt': 'input',
151 | 'query': '',
152 | 'response': 'output',
153 | 'history': ''
154 | }
155 | },
156 | 'novel_tokens512_50k': {
157 | 'hf_hub_url': 'zxbsmk/webnovel_cn',
158 | 'local_path': '',
159 | 'multi_turn': False
160 | }
161 | }
162 |
163 | return dataset_info
164 |
--------------------------------------------------------------------------------
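A small lookup sketch for get_dataset_info(); the import path and dataset directory are hypothetical and depend on how the module sits on your PYTHONPATH:

from dataset_info import get_dataset_info

info = get_dataset_info('/path/to/datasets')
print(info['olcc']['local_path'])   # /path/to/datasets/olcc/olcc_alpaca.json
print(info['olcc']['multi_turn'])   # False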
/retrieval_contriever/src/contriever.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import os
4 | import torch
5 | import transformers
6 | from transformers import BertModel, XLMRobertaModel
7 |
8 | from retrieval_contriever.src import utils
9 |
10 |
11 | class Contriever(BertModel):
12 | def __init__(self, config, pooling="average", **kwargs):
13 | super().__init__(config, add_pooling_layer=False)
14 | if not hasattr(config, "pooling"):
15 | self.config.pooling = pooling
16 |
17 | def forward(
18 | self,
19 | input_ids=None,
20 | attention_mask=None,
21 | token_type_ids=None,
22 | position_ids=None,
23 | head_mask=None,
24 | inputs_embeds=None,
25 | encoder_hidden_states=None,
26 | encoder_attention_mask=None,
27 | output_attentions=None,
28 | output_hidden_states=None,
29 | normalize=False,
30 | ):
31 |
32 | model_output = super().forward(
33 | input_ids=input_ids,
34 | attention_mask=attention_mask,
35 | token_type_ids=token_type_ids,
36 | position_ids=position_ids,
37 | head_mask=head_mask,
38 | inputs_embeds=inputs_embeds,
39 | encoder_hidden_states=encoder_hidden_states,
40 | encoder_attention_mask=encoder_attention_mask,
41 | output_attentions=output_attentions,
42 | output_hidden_states=output_hidden_states,
43 | )
44 |
45 | last_hidden = model_output["last_hidden_state"]
46 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0)
47 |
48 | if self.config.pooling == "average":
49 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
50 | elif self.config.pooling == "cls":
51 | emb = last_hidden[:, 0]
52 |
53 | if normalize:
54 | emb = torch.nn.functional.normalize(emb, dim=-1)
55 | return emb
56 |
57 |
58 | class XLMRetriever(XLMRobertaModel):
59 | def __init__(self, config, pooling="average", **kwargs):
60 | super().__init__(config, add_pooling_layer=True)
61 | if not hasattr(config, "pooling"):
62 | self.config.pooling = pooling
63 |
64 | def forward(
65 | self,
66 | input_ids=None,
67 | attention_mask=None,
68 | token_type_ids=None,
69 | position_ids=None,
70 | head_mask=None,
71 | inputs_embeds=None,
72 | encoder_hidden_states=None,
73 | encoder_attention_mask=None,
74 | output_attentions=None,
75 | output_hidden_states=None,
76 | normalize=False,
77 | ):
78 |
79 | model_output = super().forward(
80 | input_ids=input_ids,
81 | attention_mask=attention_mask,
82 | token_type_ids=token_type_ids,
83 | position_ids=position_ids,
84 | head_mask=head_mask,
85 | inputs_embeds=inputs_embeds,
86 | encoder_hidden_states=encoder_hidden_states,
87 | encoder_attention_mask=encoder_attention_mask,
88 | output_attentions=output_attentions,
89 | output_hidden_states=output_hidden_states,
90 | )
91 |
92 | last_hidden = model_output["last_hidden_state"]
93 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0)
94 | if self.config.pooling == "average":
95 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
96 | elif self.config.pooling == "cls":
97 | emb = last_hidden[:, 0]
98 | if normalize:
99 | emb = torch.nn.functional.normalize(emb, dim=-1)
100 | return emb
101 |
102 |
103 | def load_retriever(model_path, pooling="average", random_init=False):
104 | # try: check if model exists locally
105 | path = os.path.join(model_path, "checkpoint.pth")
106 | if os.path.exists(path):
107 | pretrained_dict = torch.load(path, map_location="gpu")
108 | opt = pretrained_dict["opt"]
109 |         # Determine which base encoder the checkpoint was trained from.
110 |         if hasattr(opt, "retriever_model_id"):
111 |             retriever_model_id = opt.retriever_model_id
112 |         else:
113 |             # Older checkpoints do not record the encoder id, so fall back to
114 |             # the multilingual BERT encoder.
115 |             # retriever_model_id = "bert-base-uncased"
116 |             retriever_model_id = "bert-base-multilingual-cased"
117 | tokenizer = utils.load_hf(transformers.AutoTokenizer, retriever_model_id)
118 | cfg = utils.load_hf(transformers.AutoConfig, retriever_model_id)
119 | if "xlm" in retriever_model_id:
120 | model_class = XLMRetriever
121 | else:
122 | model_class = Contriever
123 | retriever = model_class(cfg)
124 | pretrained_dict = pretrained_dict["model"]
125 |
126 | if any("encoder_q." in key for key in pretrained_dict.keys()): # test if model is defined with moco class
127 | pretrained_dict = {k.replace("encoder_q.", ""): v for k, v in pretrained_dict.items() if "encoder_q." in k}
128 | elif any("encoder." in key for key in pretrained_dict.keys()): # test if model is defined with inbatch class
129 | pretrained_dict = {k.replace("encoder.", ""): v for k, v in pretrained_dict.items() if "encoder." in k}
130 | retriever.load_state_dict(pretrained_dict, strict=False)
131 | else:
132 | retriever_model_id = model_path
133 | if "xlm" in retriever_model_id:
134 | model_class = XLMRetriever
135 | else:
136 | model_class = Contriever
137 | cfg = utils.load_hf(transformers.AutoConfig, model_path)
138 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_path)
139 | retriever = utils.load_hf(model_class, model_path)
140 |
141 | return retriever, tokenizer, retriever_model_id
142 |
--------------------------------------------------------------------------------
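A hedged usage sketch for load_retriever(): model_path can be a directory containing a MoCo/in-batch checkpoint.pth or a plain Hugging Face identifier. "facebook/contriever" is the public checkpoint this class mirrors and is used here only as an example; the import path follows the usage in generate_passage_embeddings.py:

import torch

from src.contriever import load_retriever

model, tokenizer, _ = load_retriever("facebook/contriever")
model.eval()

batch = tokenizer(["Where was Marie Curie born?"],
                  padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    emb = model(**batch)        # mean-pooled ("average") sentence embedding
print(emb.shape)                # (1, hidden_size)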
/source/model/flan-t5/flan_seq2seq.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import numpy as np
4 | import pandas as pd
5 | import pickle
6 | import nltk
7 |
8 | nltk.download("punkt")
9 |
10 | from transformers import (
11 | AutoTokenizer,
12 | AutoModelForSeq2SeqLM,
13 | DataCollatorForSeq2Seq,
14 | Seq2SeqTrainingArguments,
15 | Seq2SeqTrainer,
16 | )
17 | from peft import (
18 | get_peft_model,
19 | TaskType,
20 | LoraConfig,
21 | PrefixTuningConfig,
22 | )
23 |
24 | from utils import get_data
25 |
26 |
27 | def main(args):
28 | model_name_or_path = args.pretrained_ckpt
29 |
30 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
31 |
32 | # loading dataset
33 | dataset, max_source_length, max_target_length = get_data(tokenizer)
34 |
35 | def preprocess_function(sample, padding="max_length"):
36 | # add prefix to the input for t5
37 | inputs = ["query: " + item for item in sample["question"]]
38 |
39 | # tokenize inputs
40 | model_inputs = tokenizer(
41 | inputs, max_length=max_source_length, padding=padding, truncation=True
42 | )
43 |
44 | labels = tokenizer(
45 | text_target=sample["answer"],
46 | max_length=max_target_length,
47 | padding=padding,
48 | truncation=True,
49 | )
50 |
51 | if padding == "max_length":
52 | labels["input_ids"] = [
53 | [(l if l != tokenizer.pad_token_id else -100) for l in label]
54 | for label in labels["input_ids"]
55 | ]
56 |
57 | model_inputs["labels"] = labels["input_ids"]
58 | return model_inputs
59 |
60 | tokenized_dataset = dataset.map(
61 | preprocess_function, batched=True, remove_columns=["question", "answer"]
62 | )
63 | print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
64 |
65 | print("Getting PEFT method")
66 |     results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}"  # default for plain "sft"; overridden below
67 | if args.peft_method == "lora":
68 | peft_config = LoraConfig(
69 | task_type=TaskType.SEQ_2_SEQ_LM,
70 | inference_mode=False,
71 | r=args.lora_r,
72 | lora_alpha=32,
73 | lora_dropout=args.dropout,
74 | target_modules=["q", "v"],
75 | )
76 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_r-{args.lora_r}_dropout-{args.dropout}"
77 |
78 | elif args.peft_method == "prefix":
79 | peft_config = PrefixTuningConfig(
80 | task_type=TaskType.SEQ_2_SEQ_LM,
81 | inference_mode=False,
82 | num_virtual_tokens=args.prefix_tokens,
83 | prefix_projection=True if args.prefix_projection else False,
84 | )
85 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_prefixTokens-{args.prefix_tokens}_useProjection-{args.prefix_projection}"
86 |
87 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
88 | if args.peft_method != "sft":
89 | model = get_peft_model(model, peft_config)
90 |         model.print_trainable_parameters()  # prints the summary itself; returns None
91 |
92 | # Define training args
93 | training_args = Seq2SeqTrainingArguments(
94 | do_train=True,
95 | do_eval=True,
96 | evaluation_strategy="epoch",
97 | logging_strategy="epoch",
98 | save_strategy="no",
99 | per_device_eval_batch_size=8,
100 | per_device_train_batch_size=8,
101 | gradient_accumulation_steps=1,
102 | output_dir=results_dir,
103 | auto_find_batch_size=True,
104 | learning_rate=1e-3,
105 | num_train_epochs=args.epochs,
106 | logging_dir=f"{results_dir}/logs",
107 | report_to="none",
108 | )
109 |
110 | # we want to ignore tokenizer pad token in the loss
111 | label_pad_token_id = -100
112 | # Data collator
113 | data_collator = DataCollatorForSeq2Seq(
114 | tokenizer,
115 | model=model,
116 | label_pad_token_id=label_pad_token_id,
117 | pad_to_multiple_of=8,
118 | )
119 |
120 | print(f"training_args = {training_args}")
121 | # Create Trainer instance
122 | trainer = Seq2SeqTrainer(
123 | model=model,
124 | args=training_args,
125 | train_dataset=tokenized_dataset["train"],
126 | eval_dataset=tokenized_dataset["validation"],
127 | data_collator=data_collator,
128 | )
129 | model.config.use_cache = False
130 |
131 | trainer_stats = trainer.train()
132 | train_loss = trainer_stats.training_loss
133 | eval_stats = trainer.evaluate()
134 | eval_loss = eval_stats["eval_loss"]
135 | print(f"Training loss:{train_loss}|Val loss:{eval_loss}")
136 |
137 | peft_model_id = f"{results_dir}/assets"
138 | trainer.model.save_pretrained(peft_model_id)
139 | tokenizer.save_pretrained(peft_model_id)
140 |
141 | with open(f"{results_dir}/results.pkl", "wb") as handle:
142 | run_result = [
143 | args.epochs,
144 | args.prefix_tokens,
145 | args.prefix_projection,
146 | train_loss,
147 | eval_loss,
148 | ]
149 | pickle.dump(run_result, handle)
150 | print("Experiment over")
151 |
152 |
153 | if __name__ == "__main__":
154 | parser = argparse.ArgumentParser()
155 | parser.add_argument("--pretrained_ckpt", default="google/flan-t5-large")
156 | parser.add_argument("--peft_method", default="sft")
157 | parser.add_argument("--lora_r", default=16, type=int)
158 | parser.add_argument("--epochs", default=1, type=int)
159 | parser.add_argument("--prefix_tokens", default=20, type=int)
160 | parser.add_argument("--prefix_projection", default=1, type=int)
161 | parser.add_argument("--dropout", default=0.1, type=float)
162 | parser.add_argument("--p_tokens", default=20, type=int)
163 | parser.add_argument("--p_hidden", default=100, type=int)
164 | parser.add_argument("--prompt_tokens", default=20, type=int)
165 |
166 | args = parser.parse_args()
167 | main(args)
168 |
--------------------------------------------------------------------------------
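A tiny illustration of the label masking done in preprocess_function above: pad positions in the labels are replaced by -100 so the seq2seq cross-entropy loss ignores them (T5 tokenizers use pad id 0):

pad_token_id = 0
label = [4, 17, 250, pad_token_id, pad_token_id]
print([(l if l != pad_token_id else -100) for l in label])   # [4, 17, 250, -100, -100]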
/source/model/llama2/examples/finetune_llm/finetune_llama_with_qlora.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | import transformers
5 | from datasets import load_dataset
6 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
7 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
8 | BitsAndBytesConfig, DataCollatorForLanguageModeling,
9 | LlamaTokenizer, Trainer, TrainingArguments)
10 |
11 | DEFAULT_PAD_TOKEN = '[PAD]'
12 | DEFAULT_EOS_TOKEN = '</s>'
13 | DEFAULT_BOS_TOKEN = '<s>'
14 | DEFAULT_UNK_TOKEN = '<unk>'
15 |
16 |
17 | def print_trainable_parameters(model: AutoModelForCausalLM) -> None:
18 | """
19 | Prints the number of trainable parameters in the model.
20 | """
21 | trainable_params, all_param = 0, 0
22 | for _, param in model.named_parameters():
23 | all_param += param.numel()
24 | if param.requires_grad:
25 | trainable_params += param.numel()
26 |     print(f'trainable params: {trainable_params} || '
27 |           f'all params: {all_param} || '
28 |           f'trainable%: {100 * trainable_params / all_param}')
29 |
30 |
31 | def smart_tokenizer_and_embedding_resize(
32 | special_tokens_dict: Dict,
33 | tokenizer: transformers.PreTrainedTokenizer,
34 | model: transformers.PreTrainedModel,
35 | ):
36 | """Resize tokenizer and embedding.
37 |
38 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
39 | """
40 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
41 | model.resize_token_embeddings(len(tokenizer))
42 |
43 | if num_new_tokens > 0:
44 | input_embeddings = model.get_input_embeddings().weight.data
45 | output_embeddings = model.get_output_embeddings().weight.data
46 |
47 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
48 | dim=0, keepdim=True)
49 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
50 | dim=0, keepdim=True)
51 |
52 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
53 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
54 |
55 |
56 | if __name__ == '__main__':
57 | model_id = 'decapoda-research/llama-7b-hf'
58 | bnb_config = BitsAndBytesConfig(
59 | load_in_4bit=True,
60 | bnb_4bit_use_double_quant=True,
61 | bnb_4bit_quant_type='nf4',
62 | bnb_4bit_compute_dtype=torch.bfloat16,
63 | )
64 | """
65 | - load_in_4bit: The model will be loaded in the memory with 4-bit precision.
66 | - bnb_4bit_use_double_quant: We will do the double quantization proposed by QLoRa.
67 | - bnb_4bit_quant_type: This is the type of quantization. “nf4” stands for 4-bit NormalFloat.
68 | - bnb_4bit_compute_dtype: While we load and store the model in 4-bit,
69 | we will partially dequantize it when needed and do all the computations with a 16-bit precision (bfloat16).
70 | """
71 | # So now we can load the model in 4-bit:
72 | model = AutoModelForCausalLM.from_pretrained(
73 | model_id, quantization_config=bnb_config, device_map={'': 0})
74 |
75 | # Then, we enable gradient checkpointing, to reduce the memory footprint of the model:
76 | model.gradient_checkpointing_enable()
77 | # Then, we load the tokenizer:
78 | if model.config.model_type == 'llama':
79 | # Due to the name of Transformers' LlamaTokenizer, we have to do this
80 | tokenizer = LlamaTokenizer.from_pretrained(
81 | model_id,
82 | padding_side='right',
83 | use_fast=True,
84 | )
85 | else:
86 | tokenizer = AutoTokenizer.from_pretrained(
87 | model_id,
88 | padding_side='right',
89 | use_fast=True,
90 | )
91 | # Preprocessing the GPT model for LoRa
92 | model = prepare_model_for_kbit_training(model)
93 | # This is where we use PEFT. We prepare the model for LoRa, adding trainable adapters for each layer.
94 | config = LoraConfig(
95 | r=8,
96 | lora_alpha=32,
97 | target_modules=['q_proj', 'v_proj'],
98 | lora_dropout=0.05,
99 | bias='none',
100 | task_type='CAUSAL_LM',
101 | )
102 | # We can now add the adapters to the model:
103 | model = get_peft_model(model, config)
104 | # We can now print the number of trainable parameters in the model:
105 | print_trainable_parameters(model)
106 |
107 | # Get your dataset ready
108 | # For this demo, I use the “english_quotes” dataset. This is a dataset made of famous quotes distributed under a CC BY 4.0 license.
109 | data = load_dataset('Abirate/english_quotes')
110 | data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
111 |
112 |     # Add special tokens to the tokenizer if missing (ideally before the dataset is tokenized above).
113 | special_tokens_dict: Dict[str, str] = {}
114 | if tokenizer.pad_token is None:
115 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
116 | if tokenizer.eos_token is None:
117 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
118 | if tokenizer.bos_token is None:
119 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
120 | if tokenizer.unk_token is None:
121 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
122 |
123 | smart_tokenizer_and_embedding_resize(
124 | special_tokens_dict=special_tokens_dict,
125 | tokenizer=tokenizer,
126 | model=model,
127 | )
128 |
129 | trainer = Trainer(
130 | model=model,
131 | train_dataset=data['train'],
132 | args=TrainingArguments(
133 | per_device_train_batch_size=4,
134 | gradient_accumulation_steps=8,
135 | warmup_steps=2,
136 | max_steps=1000,
137 | learning_rate=2e-4,
138 | fp16=True,
139 | logging_steps=1,
140 | output_dir='outputs',
141 | optim='paged_adamw_8bit',
142 | ),
143 | data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
144 | )
145 | model.config.use_cache = False # silence the warnings. Please re-enable for inference!
146 | trainer.train()
147 |
--------------------------------------------------------------------------------